Python rdkit.Chem.SDMolSupplier() Examples
The following are 15 code examples of rdkit.Chem.SDMolSupplier().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module rdkit.Chem, or try the search function.
Example #1
Source File: converter.py From 3DGCN with MIT License | 7 votes |
def rotate_molecule(path, target_path, count=10):
    """Write ``count`` randomly rotated copies of every molecule in an SDF file.

    Args:
        path: Path of the SDF file to read.
        target_path: Path of the SDF file the rotated copies are written to.
        count: Number of rotated copies produced per input molecule.
    """
    # Load dataset
    mols = Chem.SDMolSupplier(path)
    rotated_mols = []
    print("Loaded {} Molecules from {}".format(len(mols), path))

    print("Rotating Molecules...")
    for mol in mols:
        # The supplier yields None for records it cannot parse.
        if mol is None:
            continue
        for _ in range(count):
            # Rotate a copy: appending the same object repeatedly would leave
            # every stored entry pointing at one (the final) orientation.
            rotated = Chem.Mol(mol)
            conformer = rotated.GetConformer()
            # Draw one matrix per copy so all atoms move together rigidly.
            # NOTE(review): random_rotation_matrix is defined elsewhere;
            # presumably it returns a 3x3 rotation matrix - confirm.
            rotation = random_rotation_matrix()
            for atom in rotated.GetAtoms():
                atom_idx = atom.GetIdx()
                pos = list(conformer.GetAtomPosition(atom_idx))
                pos_rotated = np.matmul(rotation, pos)
                conformer.SetAtomPosition(atom_idx, pos_rotated)
            rotated_mols.append(rotated)

    w = Chem.SDWriter(target_path)
    try:
        for m in rotated_mols:
            w.write(m)
    finally:
        # Closing flushes buffered records to disk.
        w.close()
    print("Saved {} Molecules to {}".format(len(rotated_mols), target_path))
Example #2
Source File: curve.py From 3DGCN with MIT License | 6 votes |
def draw_confusion_matrix(dataset, model, set_trial=None, filename="test_results.sdf"):
    """Print the confusion-matrix counts for one trial's stored predictions.

    Args:
        dataset: Dataset name used to locate the result directory.
        model: Model name used to locate the result directory.
        set_trial: Trial directory name. When None, the directory is obtained
            from ``find_average_trial`` (defined elsewhere) with
            ``metric="test_pr"``.
        filename: Name of the SDF file whose records carry the "true" and
            "pred" properties.
    """
    if set_trial is None:
        result_dir = find_average_trial(dataset, model, metric="test_pr")
    else:
        result_dir = "../result/{}/{}/{}/".format(model, dataset, set_trial)

    # Load true, pred value
    labels, scores = [], []
    for record in Chem.SDMolSupplier(result_dir + filename):
        labels.append(float(record.GetProp("true")))
        scores.append(float(record.GetProp("pred")))

    true_y = np.array(labels, dtype=float)
    # Predictions are rounded to the nearest integer before comparison.
    pred_y = np.array(scores, dtype=float).round()

    # Get precision and recall
    tn, fp, fn, tp = confusion_matrix(true_y, pred_y).ravel()
    print("tn: {}, fp: {}, fn: {}, tp: {}".format(tn, fp, fn, tp))
Example #3
Source File: save.py From PADME with MIT License | 6 votes |
def load_sdf_files(input_files, clean_mols):
    """Load SDF files into dataframes, one dataframe per input file.

    Args:
        input_files: Iterable of SDF file paths. Each path must have a
            companion ``<path>.csv`` file holding the task columns.
        clean_mols: Passed to ``Chem.SDMolSupplier`` as its second positional
            argument (``sanitize``).

    Returns:
        List of dataframes with columns ``mol_id``, ``smiles``, ``mol``
        followed by the columns read from the companion CSV.
    """
    dataframes = []
    for input_file in input_files:
        # Tasks are stored in .sdf.csv file
        raw_df = next(load_csv_files([input_file + ".csv"], shard_size=None))
        # Structures are stored in .sdf file
        print("Reading structures from %s." % input_file)
        suppl = Chem.SDMolSupplier(str(input_file), clean_mols, False, False)
        df_rows = []
        for ind, mol in enumerate(suppl):
            if mol is not None:
                smiles = Chem.MolToSmiles(mol)
                df_rows.append([ind, smiles, mol])
        # Index each structure by its record position so the inner join pairs
        # it with the matching CSV row even when unparsable records were
        # skipped; a plain 0..k-1 index would shift every later row.
        # NOTE(review): assumes raw_df row i describes SDF record i and has
        # a default integer index - confirm against load_csv_files.
        mol_df = pd.DataFrame(
            df_rows,
            columns=('mol_id', 'smiles', 'mol'),
            index=[row[0] for row in df_rows])
        merged = pd.concat([mol_df, raw_df], axis=1, join='inner')
        # Restore the contiguous index downstream code received before.
        dataframes.append(merged.reset_index(drop=True))
    return dataframes
Example #4
Source File: Getmol.py From PyBioMed with BSD 3-Clause "New" or "Revised" License | 6 votes |
def ReadMolFromSDF(filename=""):
    """
    Open an SDF file and return the molecules it contains.

    The return value is the ``Chem.SDMolSupplier`` built for the file;
    iterate over it (for example with a ``for`` statement) to get the
    individual molecule objects.

    Usage:

        res=ReadMolFromSDF(filename)

        Input: filename is a file name with path.

        Output: res is a set of molecular objects.
    """
    return Chem.SDMolSupplier(filename)
Example #5
Source File: sdf_file_parser.py From chainer-chemistry with MIT License | 6 votes |
def extract_total_num(self, filepath):
    """Extracts total number of data which can be parsed

    Use this method to choose the value fed to the `target_index` option
    of the `parse` method. For example, to extract input features from
    10% of the whole dataset, the number of samples in the file must be
    known first. The returned value may not be the same as the final
    dataset size.

    Args:
        filepath (str): path of the file whose total number is checked.

    Returns (int): total number of data that can be parsed.

    """
    return len(Chem.SDMolSupplier(filepath))
Example #6
Source File: predict_enriched.py From PIDGINv3 with GNU General Public License v3.0 | 6 votes |
def importQuerySDF(in_file):
    """Read query compounds from an SDF file and fingerprint each one.

    Records that cannot be used are reported on stdout and skipped.

    Args:
        in_file: Path of the SDF file to read.

    Returns:
        Tuple ``(fingerprints, mols)``: a uint8 numpy array built from the
        fingerprints, and the list of molecules returned by
        ``calcFingerprints`` (defined elsewhere).
    """
    outfp = []
    outmol = []
    query = Chem.SDMolSupplier(in_file)
    # Iterate the supplier opened above.
    for idx, m in enumerate(query):
        sys.stdout.write(' Importing SDF file. Compound number: %s\r' % idx)
        sys.stdout.flush()
        try:
            if not m:
                raise SdfNoneMolError('None mol')
            smi, fp, mol = calcFingerprints(m, qtype='sdf')
            outfp.append(fp)
            outmol.append(mol)
        except SdfNoneMolError:
            # Single-argument print() behaves the same on Python 2 and 3.
            print(' SDF parse error (compound index: ' + str(idx) + ')')
    # Terminate the carriage-return progress line.
    print('')
    return np.array(outfp, dtype=np.uint8), outmol
#unzip a pkl model
Example #7
Source File: predict.py From PIDGINv3 with GNU General Public License v3.0 | 6 votes |
def importQuerySDF(in_file):
    """Read query compounds from an SDF file and fingerprint each one.

    Records that cannot be used are reported on stdout and skipped.

    Args:
        in_file: Path of the SDF file to read.

    Returns:
        Tuple ``(fingerprints, mols, ids)``: a uint8 numpy array built from
        the fingerprints, the list of molecules returned by
        ``calcFingerprints`` (defined elsewhere), and one identifier per
        compound (its ``_Name`` property, or the SMILES string when that
        property is missing).
    """
    outfp = []
    outid = []
    outmol = []
    query = Chem.SDMolSupplier(in_file)
    # Iterate the supplier opened above.
    for idx, m in enumerate(query):
        sys.stdout.write(' Importing SDF file. Compound number: %s\r' % idx)
        sys.stdout.flush()
        try:
            if not m:
                raise SdfNoneMolError('None mol')
            smi, fp, mol = calcFingerprints(m, qtype='sdf')
            try:
                outid.append(m.GetProp('_Name'))
            except KeyError:
                # No name stored in the record: fall back to the SMILES.
                outid.append(smi)
            outfp.append(fp)
            outmol.append(mol)
        except SdfNoneMolError:
            # Single-argument print() behaves the same on Python 2 and 3.
            print(' SDF parse error (compound index: ' + str(idx) + ')')
    # Terminate the carriage-return progress line.
    print('')
    return np.array(outfp, dtype=np.uint8), outmol, outid
#unzip a pkl model
Example #8
Source File: dataset.py From 3DGCN with MIT License | 5 votes |
def replace_dataset(self, path, subset="test", target_name="target"):
    """Replace one subset of the dataset with molecules read from an SDF file.

    Args:
        path: Path of the SDF file to read.
        subset: Key under which the new arrays are stored in ``self.x``,
            ``self.c`` and ``self.y``.
        target_name: Property name holding the target, or a list of property
            names for the multitask case.
    """
    kept_mols, coords, targets = [], [], []

    for mol in Chem.SDMolSupplier(path):
        if mol is None:
            continue

        if type(target_name) is list:
            # Multitask: a property missing from the record is stored as -1.
            row = [float(mol.GetProp(t)) if t in mol.GetPropNames() else -1
                   for t in target_name]
            targets.append(row)
            self.outputs = len(self.target_name)
        elif target_name in mol.GetPropNames():
            # Singletask: records whose target equals -1 are dropped.
            value = float(mol.GetProp(target_name))
            if value == -1:
                continue
            targets.append(value)
        else:
            # Record carries no target property at all.
            continue

        kept_mols.append(mol)
        coords.append(mol.GetConformer().GetPositions())

    # Normalize
    mol_array = np.array(kept_mols)
    coord_array = np.array(coords)
    normalized = (np.array(targets) - self.mean) / self.std

    self.x[subset] = mol_array
    self.c[subset] = coord_array
    if self.task != "regression":
        self.y[subset] = normalized.astype(int)
    else:
        self.y[subset] = normalized
Example #9
Source File: scatter_plot.py From 3DGCN with MIT License | 5 votes |
def find_confusion(dataset, base_path):
    """Save the most extreme and the best predictions of trials 1 to 10.

    For each ``trial_<i>/`` directory under ``base_path``, this reads
    ``test.sdf``, computes ``true - pred`` for every record and writes five
    molecules to ``confusion_examples_<dataset>_trial<i>.sdf``: the two with
    the largest difference, the two with the smallest difference, and the one
    with the smallest absolute difference.

    Args:
        dataset: Dataset name used in the output file name.
        base_path: Directory (with trailing separator) holding the trial
            directories.
    """
    for i in range(1, 11):
        path = base_path + "trial_{}/".format(i)

        # Load true, pred value
        mols = Chem.SDMolSupplier(path + "test.sdf")
        diff_y = []
        for mol in mols:
            diff_y.append(float(mol.GetProp("true")) - float(mol.GetProp("pred")))
        diff_y = np.array(diff_y, dtype=float)

        # Find largest, smallest error molecules
        idx = np.argsort(diff_y)
        top_1 = mols[int(idx[-1])]
        top_2 = mols[int(idx[-2])]
        btm_1 = mols[int(idx[0])]
        btm_2 = mols[int(idx[1])]

        best_idx = np.argsort(np.abs(diff_y))
        best = mols[int(best_idx[0])]

        # Save example molecules
        writer = Chem.SDWriter(path + "confusion_examples_" + dataset + "_trial" + str(i) + ".sdf")
        try:
            for mol in [top_1, top_2, btm_1, btm_2, best]:
                writer.write(mol)
        finally:
            # Close each trial's writer so its records are flushed and the
            # file handle is released before the next iteration.
            writer.close()
Example #10
Source File: save.py From deepchem with MIT License | 5 votes |
def load_sdf_files(input_files, clean_mols, tasks=None):
    """Load SDF files into dataframes, one dataframe per input file.

    Args:
        input_files: Iterable of SDF file paths.
        clean_mols: Passed to ``Chem.SDMolSupplier`` as its second positional
            argument (``sanitize``).
        tasks: Names of the SDF properties to read as task columns. Only used
            for files without a companion ``<path>.csv``. Defaults to no
            tasks.

    Returns:
        List of dataframes with columns ``mol_id``, ``smiles``, ``mol``
        followed by the task columns (taken from the companion CSV when it
        exists, otherwise from the SDF properties named in ``tasks``).
    """
    from rdkit import Chem

    # None replaces a shared mutable default list.
    tasks = [] if tasks is None else tasks

    dataframes = []
    for input_file in input_files:
        # Tasks are either in .sdf.csv file or in the .sdf file itself
        has_csv = os.path.isfile(input_file + ".csv")
        # Structures are stored in .sdf file
        print("Reading structures from %s." % input_file)
        suppl = Chem.SDMolSupplier(str(input_file), clean_mols, False, False)
        df_rows = []
        for ind, mol in enumerate(suppl):
            if mol is None:
                continue
            smiles = Chem.MolToSmiles(mol)
            df_row = [ind, smiles, mol]
            if not has_csv:  # Get task targets from .sdf file
                for task in tasks:
                    df_row.append(mol.GetProp(str(task)))
            df_rows.append(df_row)
        if has_csv:
            # Index each structure by its record position so the inner join
            # pairs it with the matching CSV row even when unparsable records
            # were skipped; a plain 0..k-1 index would shift every later row.
            # NOTE(review): assumes raw_df row i describes SDF record i and
            # has a default integer index - confirm against load_csv_files.
            mol_df = pd.DataFrame(
                df_rows,
                columns=('mol_id', 'smiles', 'mol'),
                index=[row[0] for row in df_rows])
            raw_df = next(load_csv_files([input_file + ".csv"], shard_size=None))
            merged = pd.concat([mol_df, raw_df], axis=1, join='inner')
            # Restore the contiguous index downstream code received before.
            dataframes.append(merged.reset_index(drop=True))
        else:
            mol_df = pd.DataFrame(
                df_rows, columns=('mol_id', 'smiles', 'mol') + tuple(tasks))
            dataframes.append(mol_df)
    return dataframes
Example #11
Source File: rdk.py From oddt with BSD 3-Clause "New" or "Revised" License | 5 votes |
def readstring(format, string, **kwargs):
    """Build a Molecule from its textual representation.

    Required parameters:
        format - see the informats variable for a list of available
                 input formats (matched case-insensitively)
        string - the text to parse; it is converted with ``str()`` first

    Extra keyword arguments are forwarded to the underlying parser for
    every format except "mol"/"sdf".

    Example:

    >>> input = "C1=CC=CS1"
    >>> mymol = readstring("smi", input)
    >>> len(mymol.atoms)
    5
    """
    string = str(string)
    format = format.lower()

    if format in ("mol", "sdf"):
        # Feed the text to an empty supplier and take its first record.
        supplier = Chem.SDMolSupplier()
        supplier.SetData(string)
        mol = next(supplier)
        del supplier
    elif format == "mol2":
        mol = Chem.MolFromMol2Block(string, **kwargs)
    elif format == "pdb":
        mol = MolFromPDBBlock(string, **kwargs)
    elif format == 'pdbqt':
        mol = MolFromPDBQTBlock(string, **kwargs)
    elif format == "smi":
        # First line only: first token is the SMILES, the rest is the name.
        first_line = string.strip().split('\n')[0]
        tokens = first_line.strip().split()
        mol = Chem.MolFromSmiles(tokens[0], **kwargs)
        if mol:
            mol.SetProp("_Name", ' '.join(tokens[1:]))
    elif format == 'inchi' and Chem.INCHI_AVAILABLE:
        mol = Chem.inchi.MolFromInchi(string, **kwargs)
    else:
        raise ValueError("%s is not a recognised RDKit format" % format)

    return Molecule(mol)
Example #12
Source File: spectra_predictor.py From deep-molecular-massspec with Apache License 2.0 | 5 votes |
def get_mol_list_from_sdf(sdf_fname):
    """Reads a sdf file and returns a list of molecules.

    Note: rdkit's Chem.SDMolSupplier only accepts filenames as inputs. As such
    this code only supports local filesystem name environments.

    Args:
      sdf_fname: Path to sdf file.

    Returns:
      List of rdkit.Mol objects.

    Raises:
      ValueError if a molblock in the SDF cannot be parsed.
    """
    supplier = Chem.SDMolSupplier(sdf_fname)
    parsed = []
    for position, entry in enumerate(supplier):
        if entry is None:
            # Include the raw record text so the bad block can be located.
            bad_block = supplier.GetItemText(position)
            raise ValueError("Unable to parse the following mol block %s" %
                             bad_block)
        parsed.append(entry)
    return parsed
Example #13
Source File: chopRDKit03.py From eMolFrag with GNU General Public License v3.0 | 5 votes |
def FragmentSanitize(tempSDFPath):
    """Fragment the first molecule of an SDF file on its BRICS bonds.

    The file is read with sanitization enabled; the resulting fragments are
    returned as molecules without being sanitized.

    Args:
        tempSDFPath: Path of the SDF file to read.

    Returns:
        The fragments returned by ``Chem.GetMolFrags(..., asMols=True)``.

    Raises:
        RDKitError: with code 1 when reading or fragmenting fails.
    """
    try:
        suppl2 = Chem.SDMolSupplier(tempSDFPath, sanitize=True)
        newmol2 = Chem.FragmentOnBRICSBonds(suppl2[0])
        mfl = Chem.GetMolFrags(newmol2, asMols=True, sanitizeFrags=False)
        return mfl
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit are not converted into an RDKitError.
        raise RDKitError(1)
Example #14
Source File: converter.py From 3DGCN with MIT License | 4 votes |
def converter(path, target_path, name, target_name, process=20):
    """Optimize conformers for a dataset in parallel and save them as SDF.

    Args:
        path: Input file; handled as CSV when the path contains ".csv" and as
            SDF when it contains ".sdf".
        target_path: Path of the SDF file to write.
        name: Forwarded to ``load_csv`` (defined elsewhere) for CSV input.
        target_name: Forwarded to ``load_csv`` for CSV input; for SDF input,
            the property read from each record as its target.
        process: Number of worker processes.

    Raises:
        ValueError: if the path is neither a CSV nor an SDF file.
    """
    # Load dataset
    print("Loading Dataset...")
    mols, props = [], []
    if ".csv" in path:
        x, y = load_csv(path, name, target_name)
        for smi, prop in zip(x, y):
            mol = Chem.MolFromSmiles(smi)
            if mol is not None:
                mols.append(mol)
                props.append(prop)
    elif ".sdf" in path:
        for mol in Chem.SDMolSupplier(path):
            # Skip unparsable records, mirroring the CSV branch.
            if mol is not None:
                mols.append(mol)
                props.append(mol.GetProp(target_name))
    else:
        raise ValueError("Unsupported file type.")
    mol_idx = list(range(len(mols)))
    print("Loaded {} Molecules from {}".format(len(mols), path))

    # Optimize coordinate using multiprocessing
    print("Optimizing Conformers...")
    # The context manager shuts the workers down once the results are in.
    with mp.Pool(process) as pool:
        results = pool.starmap(optimize_conformer, zip(mol_idx, mols, props))

    # Remove None and add properties
    mol_list_filtered = []
    for mol, prop in results:
        if mol is not None:
            mol.SetProp("target", str(prop))
            mol_list_filtered.append(mol)
    print("{} Molecules Optimized".format(len(mol_list_filtered)))

    # Save molecules
    print("Saving File...")
    w = Chem.SDWriter(target_path)
    try:
        for m in mol_list_filtered:
            w.write(m)
    finally:
        # Closing flushes buffered records to disk.
        w.close()
    print("Saved {} Molecules to {}".format(len(mol_list_filtered), target_path))
Example #15
Source File: sparse_molecular_dataset.py From MolGAN with MIT License | 4 votes |
def generate(self, filename, add_h=False, filters=lambda x: True, size=None, validation=0.1, test=0.1):
    """Build the dataset arrays from an SDF or SMILES file.

    Args:
        filename: Path of a ``.sdf`` file, or of a ``.smi`` file with one
            SMILES per line.
        add_h: When true, ``Chem.AddHs`` is applied to every molecule.
        filters: Predicate; only molecules for which it is truthy are kept.
        size: Maximum number of molecules kept (None keeps all).
        validation: Forwarded to ``_generate_train_validation_test``.
        test: Forwarded to ``_generate_train_validation_test``.

    Raises:
        ValueError: if ``filename`` ends in neither ``.sdf`` nor ``.smi``.
    """
    self.log('Extracting {}..'.format(filename))

    if filename.endswith('.sdf'):
        supplier = Chem.SDMolSupplier(filename)
        total = len(supplier)
        candidates = supplier
    elif filename.endswith('.smi'):
        # Read within a context manager so the file handle is released.
        with open(filename, 'r') as smiles_file:
            lines = smiles_file.readlines()
        total = len(lines)
        candidates = [Chem.MolFromSmiles(line) for line in lines]
    else:
        raise ValueError('Unsupported file type: {}'.format(filename))

    # Both parsers signal an unreadable entry with None.
    self.data = [mol for mol in candidates if mol is not None]
    self.data = list(map(Chem.AddHs, self.data)) if add_h else self.data
    self.data = list(filter(filters, self.data))
    self.data = self.data[:size]

    # `total` counts the entries of the file actually read, so the message is
    # also meaningful for .smi input.
    self.log('Extracted {} out of {} molecules {}adding Hydrogen!'.format(len(self.data),
                                                                          total,
                                                                          '' if add_h else 'not '))

    self._generate_encoders_decoders()
    self._generate_AX()

    # it contains all the molecules stored as rdkit.Chem objects
    self.data = np.array(self.data)

    # it contains all the molecules stored as SMILES strings
    self.smiles = np.array(self.smiles)

    # a (N, L) matrix where N is the length of the dataset and each L-dim vector contains the
    # indices corresponding to a SMILE sequences with padding wrt the max length of the longest
    # SMILES sequence in the dataset (see self._genS)
    self.data_S = np.stack(self.data_S)

    # a (N, 9, 9) tensor where N is the length of the dataset and each 9x9 matrix contains the
    # indices of the positions of the ones in the one-hot representation of the adjacency tensor
    # (see self._genA)
    self.data_A = np.stack(self.data_A)

    # a (N, 9) matrix where N is the length of the dataset and each 9-dim vector contains the
    # indices of the positions of the ones in the one-hot representation of the annotation matrix
    # (see self._genX)
    self.data_X = np.stack(self.data_X)

    # a (N, 9) matrix where N is the length of the dataset and each 9-dim vector contains the
    # diagonal of the correspondent adjacency matrix
    self.data_D = np.stack(self.data_D)

    # a (N, F) matrix where N is the length of the dataset and each F vector contains features
    # of the correspondent molecule (see self._genF)
    self.data_F = np.stack(self.data_F)

    # a (N, 9) matrix where N is the length of the dataset and each 9-dim vector contains the
    # eigenvalues of the correspondent Laplacian matrix
    self.data_Le = np.stack(self.data_Le)

    # a (N, 9, 9) matrix where N is the length of the dataset and each 9x9 matrix contains the
    # eigenvectors of the correspondent Laplacian matrix
    self.data_Lv = np.stack(self.data_Lv)

    self.vertexes = self.data_F.shape[-2]
    self.features = self.data_F.shape[-1]

    self._generate_train_validation_test(validation, test)