Python train val test split
23 Python code examples are found related to "train val test split".
You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
Example 1
Source File: audio_util.py From Tensorflow-Audio-Classification with Apache License 2.0 | 8 votes |
def train_test_val_split(X, Y, split=(0.2, 0.1), shuffle=True):
    """Split dataset into train/val/test subsets by 70:20:10 (default).

    Args:
        X: List of data.
        Y: List of labels corresponding to data.
        split: Tuple of split ratio in `test:val` order.
        shuffle: Bool of shuffle or not.

    Returns:
        Three datasets in `train:test:val` order.
    """
    from sklearn.model_selection import train_test_split
    assert len(X) == len(Y), 'The length of X and Y must be consistent.'
    X_train, X_test_val, Y_train, Y_test_val = train_test_split(
        X, Y, test_size=(split[0] + split[1]), shuffle=shuffle)
    X_test, X_val, Y_test, Y_val = train_test_split(
        X_test_val, Y_test_val, test_size=split[1], shuffle=False)
    return (X_train, Y_train), (X_test, Y_test), (X_val, Y_val)
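A minimal usage sketch, assuming the helper above is in scope and scikit-learn is installed; the toy data is made up for illustration. Note that the second train_test_split call applies test_size=split[1] to the held-out chunk rather than to the full dataset, so the default split=(0.2, 0.1) yields roughly 70:27:3 rather than the 70:20:10 stated in the docstring; dividing split[1] by (split[0] + split[1]) would match the docstring.

# Hypothetical call on toy data.
X = ['sample_%d' % i for i in range(100)]
Y = [i % 2 for i in range(100)]

(X_train, Y_train), (X_test, Y_test), (X_val, Y_val) = train_test_val_split(X, Y, split=(0.2, 0.1))
print(len(X_train), len(X_test), len(X_val))  # roughly 70, 27, 3 with the defaults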
Example 2
Source File: cv.py From deep_pipe with MIT License | 6 votes |
def train_val_test_split(ids, *, val_size, n_splits, random_state=42):
    """
    Splits the dataset's ids into triplets (train, validation, test).

    The test ids are determined as in the standard K-fold cross-validation setting:
    for each fold a different portion of 1/K ids is kept for testing.
    The remaining (K - 1) / K ids are split into train and validation sets
    according to ``val_size``.

    Parameters
    ----------
    ids
    val_size: float, int
        If ``float``, should be between 0.0 and 1.0 and represents the proportion
        of the train set to include in the validation set. If ``int``, represents
        the absolute number of validation samples.
    n_splits: int
        the number of cross-validation folds.

    Returns
    -------
    splits: Sequence of triplets
    """
    split_indices = kfold_split(subj_ids=ids, n_splits=n_splits, random_state=random_state)
    split_indices = split_train(splits=split_indices, val_size=val_size, random_state=random_state)
    return indices_to_ids(split_indices, ids)
Example 3
Source File: data.py From adversarial-object-removal with MIT License | 6 votes |
def split_train_val_test(data_dir, img_size=256):
    df = pd.read_csv(
        join(data_dir, 'list_eval_partition.txt'),
        delim_whitespace=True, header=None
    )
    filenames, labels = df.values[:, 0], df.values[:, 1]

    train_filenames = filenames[labels == 0]
    valid_filenames = filenames[labels == 1]
    test_filenames = filenames[labels == 2]

    train_set = Dataset(
        data_dir, train_filenames,
        input_transform_augment(178, img_size),
        target_transform(), target_transform_binary()
    )
    valid_set = Dataset(
        data_dir, valid_filenames,
        input_transform(178, img_size),
        target_transform(), target_transform_binary()
    )
    test_set = Dataset(
        data_dir, test_filenames,
        input_transform(178, img_size),
        target_transform(), target_transform_binary()
    )

    return train_set, valid_set, test_set
Example 4
Source File: Data.py From pysster with MIT License | 5 votes |
def train_val_test_split(self, portion_train, portion_val, seed=None):
    """
    Randomly split the data into training, validation and test set. Example:
    setting portion_train = 0.6 and portion_val = 0.3 will set aside 60% of
    the data for training, 30% for validation and the remaining 10% for
    testing. Use the seed parameter to get reproducible splits.

    Parameters
    ----------
    portion_train: float
        Portion of data that should be used for training (<1.0)

    portion_val: float
        Portion of data that should be used for validation (<1.0)

    seed: int
        Seed for the random number generator.
    """
    if seed:
        np.random.seed(seed)
    num_sequences = len(self.data)
    break_train = int(num_sequences * portion_train)
    break_val = int(num_sequences * (portion_train + portion_val))
    splits = np.random.permutation(np.arange(num_sequences))
    splits = np.split(splits, [break_train, break_val])
    self.splits = {"train": splits[0], "val": splits[1], "test": splits[2]}
Example 5
Source File: preprocess.py From DeepLab_v3 with MIT License | 5 votes |
def train_val_test_split(dataset_filenames, split_ratios, train_dataset_filename,
                         valid_dataset_filename, test_dataset_filename):
    '''
    Split dataset into train, valid, and test datasets
    dataset_filenames: a list of image filenames
    split_ratios: [train_dataset_ratio, valid_dataset_ratio, test_dataset_ratio], e.g., [0.7, 0.2, 0.1]
    train_dataset_filename: path of txt file to save the filenames of train data
    valid_dataset_filename: path of txt file to save the filenames of valid data
    test_dataset_filename: path of txt file to save the filenames of test data
    '''
    assert len(split_ratios) == 3 and abs(sum(split_ratios) - 1) < 1e-5, 'Please use all the data.'

    dataset_filenames = np.asarray(dataset_filenames)
    idx = np.arange(len(dataset_filenames))
    np.random.shuffle(idx)

    train_split_idx = int(len(dataset_filenames) * split_ratios[0])
    valid_split_idx = int(len(dataset_filenames) * (split_ratios[0] + split_ratios[1]))

    train_idx = idx[:train_split_idx]
    valid_idx = idx[train_split_idx:valid_split_idx]
    test_idx = idx[valid_split_idx:]

    train_filenames = dataset_filenames[train_idx]
    valid_filenames = dataset_filenames[valid_idx]
    test_filenames = dataset_filenames[test_idx]

    with open(train_dataset_filename, 'w') as file:
        file.write('\n'.join(train_filenames))
    with open(valid_dataset_filename, 'w') as file:
        file.write('\n'.join(valid_filenames))
    with open(test_dataset_filename, 'w') as file:
        file.write('\n'.join(test_filenames))
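A hedged usage sketch for the helper above, assuming it is importable from the same module and numpy is installed; the filenames and output paths here are invented for illustration and the three text files are written to a temporary directory.

import os
import tempfile
import numpy as np

# Hypothetical inputs; any list of image filenames works.
filenames = ['img_%04d.png' % i for i in range(1000)]
out_dir = tempfile.mkdtemp()

np.random.seed(0)  # optional: make the shuffle reproducible
train_val_test_split(
    filenames, [0.7, 0.2, 0.1],
    os.path.join(out_dir, 'train.txt'),
    os.path.join(out_dir, 'valid.txt'),
    os.path.join(out_dir, 'test.txt'))
# train.txt now holds ~700 names, valid.txt ~200, test.txt ~100.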
Example 6
Source File: data_prep.py From Deep-Learning-Quick-Reference with MIT License | 5 votes |
def train_val_test_split(df, val_pct=0.1, test_pct=0.1):
    size = df.shape[0]
    val_pct = (val_pct * size) / (size * (1 - test_pct))
    train_val, test = train_test_split(df, test_size=test_pct)
    train, val = train_test_split(train_val, test_size=val_pct)
    return train, val, test
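The rescaling on the second line is what keeps the final proportions honest: after test_pct of the rows are set aside, val_pct is divided by (1 - test_pct), so the validation set is still val_pct of the original DataFrame. A small sketch, assuming pandas and scikit-learn's train_test_split are imported in the same module as the helper above:

import pandas as pd

# Hypothetical 1000-row frame: 10% test, then 0.1 / 0.9 of the remaining
# 900 rows for validation, i.e. about 100 rows each.
df = pd.DataFrame({'x': range(1000)})
train, val, test = train_val_test_split(df, val_pct=0.1, test_pct=0.1)
print(len(train), len(val), len(test))  # roughly 800, 100, 100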
Example 7
Source File: Train_Val_Test_spliter.py From ViolenceDetection with Apache License 2.0 | 5 votes |
def Split_Train_Val_Test_Data(LIST_OF_VIDEOS_):
    random.shuffle(LIST_OF_VIDEOS_)
    NUMBER_OF_TOTAL_DATA = len(LIST_OF_VIDEOS_)
    NUMBER_OF_TEST_VIDEOS = int(NUMBER_OF_TOTAL_DATA * TEST_SET_RATIO)
    NUMBER_OF_VAL_VIDEOS = int(NUMBER_OF_TOTAL_DATA * VAL_SET_RATIO)

    listOfTestVideos = LIST_OF_VIDEOS_[:NUMBER_OF_TEST_VIDEOS]
    listOfValVideos = LIST_OF_VIDEOS_[NUMBER_OF_TEST_VIDEOS:(NUMBER_OF_TEST_VIDEOS + NUMBER_OF_VAL_VIDEOS)]
    listOfTrainVideos = LIST_OF_VIDEOS_[(NUMBER_OF_TEST_VIDEOS + NUMBER_OF_VAL_VIDEOS):]

    return listOfTrainVideos, listOfValVideos, listOfTestVideos
Example 8
Source File: app.py From edge_detection_framework with MIT License | 5 votes |
def train_val_test_split(id_lists, train_fraction, val_fraction, test_fraction):
    train_ids = []
    val_ids = []
    test_ids = []
    for dataset_idx, id_list in enumerate(id_lists):
        print('dataset', dataset_idx, 'contains', len(id_lists), 'items.')
        train, val, test = make_splits(id_list, [train_fraction, val_fraction, test_fraction])
        train_ids += train
        val_ids += val
        test_ids += test
    print('train_ids', len(train_ids), 'val_ids', len(val_ids), 'test_ids', len(test_ids))
    return {'train': train_ids, 'valid': val_ids, 'test': test_ids}
Example 9
Source File: process_vad_data.py From NeMo with Apache License 2.0 | 5 votes |
def split_train_val_test(data_dir, file_type, test_size=0.1, val_size=0.1):
    X = []
    if file_type == "speech":
        for o in os.listdir(data_dir):
            if os.path.isdir(os.path.join(data_dir, o)) and o.split("/")[-1] != "_background_noise_":
                X.extend(glob.glob(os.path.join(data_dir, o) + '/*.wav'))
    else:
        for o in os.listdir(data_dir):
            if os.path.isdir(os.path.join(data_dir, o)):
                X.extend(glob.glob(os.path.join(data_dir, o) + '/*.wav'))
            else:
                # for using "_background_noise_" from google speech commands as background data
                if o.endswith(".wav"):
                    X.append(os.path.join(data_dir, o))

    X_train, X_test = train_test_split(X, test_size=test_size, random_state=1)
    val_size_tmp = val_size / (1 - test_size)
    X_train, X_val = train_test_split(X_train, test_size=val_size_tmp, random_state=1)

    with open(os.path.join(data_dir, file_type + "_training_list.txt"), "w") as outfile:
        outfile.write("\n".join(X_train))
    with open(os.path.join(data_dir, file_type + "_testing_list.txt"), "w") as outfile:
        outfile.write("\n".join(X_test))
    with open(os.path.join(data_dir, file_type + "_validation_list.txt"), "w") as outfile:
        outfile.write("\n".join(X_val))

    logging.info(f'Overall: {len(X)}, Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}')
    logging.info(f"Finished split train, val and test for {file_type}. Write to files!")
Example 10
Source File: preprocessing_orb.py From Indian-Sign-Language-Recognition with MIT License | 5 votes |
def train_test_val_split_idxs(total_rows, percent_test, percent_val):
    """Get indexes for training, test, and validation rows, given a total number of rows.

    Assumes indexes are sequential integers starting at 0: eg [0,1,2,3,...N]

    Returns:
    --------
    training_idxs, test_idxs, val_idxs
        Both lists of integers
    """
    if percent_test + percent_val >= 1.0:
        raise ValueError('percent_test and percent_val must sum to less than 1.0')

    row_range = range(total_rows)

    no_test_rows = int(total_rows * (percent_test))
    test_idxs = np.random.choice(row_range, size=no_test_rows, replace=False)
    # remove test indexes
    row_range = [idx for idx in row_range if idx not in test_idxs]

    no_val_rows = int(total_rows * (percent_val))
    val_idxs = np.random.choice(row_range, size=no_val_rows, replace=False)
    # remove validation indexes
    training_idxs = [idx for idx in row_range if idx not in val_idxs]

    print('Train-test-val split: %i training rows, %i test rows, %i validation rows'
          % (len(training_idxs), len(test_idxs), len(val_idxs)))

    return training_idxs, test_idxs, val_idxs
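A quick sketch of calling the index-based splitter above; numpy is assumed to be imported as np in the same module, and the row count is invented for illustration. The returned index lists can then be used to slice a feature matrix.

import numpy as np

np.random.seed(42)  # optional, for a reproducible split
train_idxs, test_idxs, val_idxs = train_test_val_split_idxs(
    total_rows=1000, percent_test=0.2, percent_val=0.1)
print(len(train_idxs), len(test_idxs), len(val_idxs))  # 700 200 100
# e.g. X_train = X[train_idxs] for a numpy feature matrix X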
Example 11
Source File: usermanager.py From tatk with Apache License 2.0 | 5 votes |
def train_test_val_split(goals, usr_dass, sys_dass, test_size=0.1, val_size=0.1):
    idx = range(len(goals))
    idx_test = random.sample(idx, int(len(goals) * test_size))
    idx_train = list(set(idx) - set(idx_test))
    idx_val = random.sample(idx_train, int(len(goals) * val_size))
    idx_train = list(set(idx_train) - set(idx_val))
    idx_train = random.sample(idx_train, len(idx_train))
    return np.array(goals)[idx_train], np.array(usr_dass)[idx_train], np.array(sys_dass)[idx_train], \
           np.array(goals)[idx_test], np.array(usr_dass)[idx_test], np.array(sys_dass)[idx_test], \
           np.array(goals)[idx_val], np.array(usr_dass)[idx_val], np.array(sys_dass)[idx_val]
Example 12
Source File: usermanager.py From tatk with Apache License 2.0 | 5 votes |
def train_test_val_split_seg(goals_seg, usr_dass_seg, sys_dass_seg, test_size=0.1, val_size=0.1):
    def dr(dss):
        return np.array([d for ds in dss for d in ds])

    idx = range(len(goals_seg))
    idx_test = random.sample(idx, int(len(goals_seg) * test_size))
    idx_train = list(set(idx) - set(idx_test))
    idx_val = random.sample(idx_train, int(len(goals_seg) * val_size))
    idx_train = list(set(idx_train) - set(idx_val))
    idx_train = random.sample(idx_train, len(idx_train))
    return dr(np.array(goals_seg)[idx_train]), dr(np.array(usr_dass_seg)[idx_train]), dr(np.array(sys_dass_seg)[idx_train]), \
           dr(np.array(goals_seg)[idx_test]), dr(np.array(usr_dass_seg)[idx_test]), dr(np.array(sys_dass_seg)[idx_test]), \
           dr(np.array(goals_seg)[idx_val]), dr(np.array(usr_dass_seg)[idx_val]), dr(np.array(sys_dass_seg)[idx_val])
Example 13
Source File: data.py From gdc with MIT License | 5 votes |
def set_train_val_test_split(
        seed: int,
        data: Data,
        num_development: int = 1500,
        num_per_class: int = 20) -> Data:
    rnd_state = np.random.RandomState(development_seed)
    num_nodes = data.y.shape[0]
    development_idx = rnd_state.choice(num_nodes, num_development, replace=False)
    test_idx = [i for i in np.arange(num_nodes) if i not in development_idx]

    train_idx = []
    rnd_state = np.random.RandomState(seed)
    for c in range(data.y.max() + 1):
        class_idx = development_idx[np.where(data.y[development_idx].cpu() == c)[0]]
        train_idx.extend(rnd_state.choice(class_idx, num_per_class, replace=False))

    val_idx = [i for i in development_idx if i not in train_idx]

    def get_mask(idx):
        mask = torch.zeros(num_nodes, dtype=torch.bool)
        mask[idx] = 1
        return mask

    data.train_mask = get_mask(train_idx)
    data.val_mask = get_mask(val_idx)
    data.test_mask = get_mask(test_idx)

    return data
Example 14
Source File: preprocess_data.py From deep-regex with MIT License | 5 votes |
def split_train_test_val(ar, ratio):
    train_set = ar[:int(len(ar) * ratio)]
    not_train_set = ar[int(len(ar) * ratio):]
    val_set = not_train_set[int(len(not_train_set) * (5.0 / 7.0)):]
    test_set = not_train_set[:int(len(not_train_set) * (5.0 / 7.0))]
    return train_set, val_set, test_set
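The hard-coded 5/7 factor means the non-training remainder is itself split 5:2 between test and validation, and since the slices are taken in order, the input is assumed to be pre-shuffled. A hypothetical call illustrating the resulting sizes:

# With ratio=0.65, the remainder is 35% of the data: the first 5/7 of it
# (25% of the whole) becomes test, the last 2/7 (10%) becomes validation.
data = list(range(1000))
train_set, val_set, test_set = split_train_test_val(data, ratio=0.65)
print(len(train_set), len(val_set), len(test_set))  # 650 100 250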
Example 15
Source File: sentence_tokenizer.py From neural_chat with MIT License | 4 votes |
def split_train_val_test(self, sentences, info_dicts, split_parameter=[0.7, 0.1, 0.2], extend_with=0):
    """ Splits given sentences into three different datasets: training,
        validation and testing.

    # Arguments:
        sentences: The sentences to be tokenized.
        info_dicts: A list of dicts that contain information about each sentence (e.g. a label).
        split_parameter: A parameter for deciding the splits between the
            three different datasets. If instead of being passed three values,
            three lists are passed, then these will be used to specify which
            observation belong to which dataset.
        extend_with: An optional parameter. If > 0 then this is the number of
            tokens added to the vocabulary from this dataset. The expanded vocab
            will be generated using only the training set, but is applied to all
            three sets.

    # Returns:
        List of three lists of tokenized sentences,
        List of three corresponding dictionaries with information,
        How many tokens have been added to the vocab. Make sure to extend the
        embedding layer of the model accordingly.
    """

    # If passed three lists, use those directly
    if isinstance(split_parameter, list) and \
            all(isinstance(x, list) for x in split_parameter) and \
            len(split_parameter) == 3:

        # Helper function to verify provided indices are numbers in range
        def verify_indices(inds):
            return list(filter(lambda i: isinstance(i, numbers.Number) and i < len(sentences), inds))

        ind_train = verify_indices(split_parameter[0])
        ind_val = verify_indices(split_parameter[1])
        ind_test = verify_indices(split_parameter[2])
    else:
        # Split sentences and dicts
        ind = list(range(len(sentences)))
        ind_train, ind_test = train_test_split(ind, test_size=split_parameter[2])
        ind_train, ind_val = train_test_split(ind_train, test_size=split_parameter[1])

    # Map indices to data
    train = np.array([sentences[x] for x in ind_train])
    test = np.array([sentences[x] for x in ind_test])
    val = np.array([sentences[x] for x in ind_val])

    info_train = np.array([info_dicts[x] for x in ind_train])
    info_test = np.array([info_dicts[x] for x in ind_test])
    info_val = np.array([info_dicts[x] for x in ind_val])

    added = 0

    # Extend vocabulary with training set tokens
    if extend_with > 0:
        wg = WordGenerator(train)
        vb = VocabBuilder(wg)
        vb.count_all_words()
        added = extend_vocab(self.vocabulary, vb, max_tokens=extend_with)

    # Wrap results
    result = [self.tokenize_sentences(s)[0] for s in [train, val, test]]
    result_infos = [info_train, info_val, info_test]
    # if type(result_infos[0][0]) in [np.double, np.float, np.int64, np.int32, np.uint8]:
    #     result_infos = [torch.from_numpy(label).long() for label in result_infos]

    return result, result_infos, added
Example 16
Source File: make_dataset.py From gnn-benchmark with MIT License | 4 votes |
def get_train_val_test_split(random_state, labels,
                             train_examples_per_class=None, val_examples_per_class=None,
                             test_examples_per_class=None,
                             train_size=None, val_size=None, test_size=None):
    num_samples, num_classes = labels.shape
    remaining_indices = list(range(num_samples))

    if train_examples_per_class is not None:
        train_indices = sample_per_class(random_state, labels, train_examples_per_class)
    else:
        # select train examples with no respect to class distribution
        train_indices = random_state.choice(remaining_indices, train_size, replace=False)

    if val_examples_per_class is not None:
        val_indices = sample_per_class(random_state, labels, val_examples_per_class, forbidden_indices=train_indices)
    else:
        remaining_indices = np.setdiff1d(remaining_indices, train_indices)
        val_indices = random_state.choice(remaining_indices, val_size, replace=False)

    forbidden_indices = np.concatenate((train_indices, val_indices))
    if test_examples_per_class is not None:
        test_indices = sample_per_class(random_state, labels, test_examples_per_class, forbidden_indices=forbidden_indices)
    elif test_size is not None:
        remaining_indices = np.setdiff1d(remaining_indices, forbidden_indices)
        test_indices = random_state.choice(remaining_indices, test_size, replace=False)
    else:
        test_indices = np.setdiff1d(remaining_indices, forbidden_indices)

    # assert that there are no duplicates in sets
    assert len(set(train_indices)) == len(train_indices)
    assert len(set(val_indices)) == len(val_indices)
    assert len(set(test_indices)) == len(test_indices)
    # assert sets are mutually exclusive
    assert len(set(train_indices) - set(val_indices)) == len(set(train_indices))
    assert len(set(train_indices) - set(test_indices)) == len(set(train_indices))
    assert len(set(val_indices) - set(test_indices)) == len(set(val_indices))
    if test_size is None and test_examples_per_class is None:
        # all indices must be part of the split
        assert len(np.concatenate((train_indices, val_indices, test_indices))) == num_samples

    if train_examples_per_class is not None:
        train_labels = labels[train_indices, :]
        train_sum = np.sum(train_labels, axis=0)
        # assert all classes have equal cardinality
        assert np.unique(train_sum).size == 1

    if val_examples_per_class is not None:
        val_labels = labels[val_indices, :]
        val_sum = np.sum(val_labels, axis=0)
        # assert all classes have equal cardinality
        assert np.unique(val_sum).size == 1

    if test_examples_per_class is not None:
        test_labels = labels[test_indices, :]
        test_sum = np.sum(test_labels, axis=0)
        # assert all classes have equal cardinality
        assert np.unique(test_sum).size == 1

    return train_indices, val_indices, test_indices
Example 17
Source File: sentence_tokenizer.py From DeepMoji with MIT License | 4 votes |
def split_train_val_test(self, sentences, info_dicts, split_parameter=[0.7, 0.1, 0.2], extend_with=0):
    """ Splits given sentences into three different datasets: training,
        validation and testing.

    # Arguments:
        sentences: The sentences to be tokenized.
        info_dicts: A list of dicts that contain information about each sentence (e.g. a label).
        split_parameter: A parameter for deciding the splits between the
            three different datasets. If instead of being passed three values,
            three lists are passed, then these will be used to specify which
            observation belong to which dataset.
        extend_with: An optional parameter. If > 0 then this is the number of
            tokens added to the vocabulary from this dataset. The expanded vocab
            will be generated using only the training set, but is applied to all
            three sets.

    # Returns:
        List of three lists of tokenized sentences,
        List of three corresponding dictionaries with information,
        How many tokens have been added to the vocab. Make sure to extend the
        embedding layer of the model accordingly.
    """

    # If passed three lists, use those directly
    if isinstance(split_parameter, list) and \
            all(isinstance(x, list) for x in split_parameter) and \
            len(split_parameter) == 3:

        # Helper function to verify provided indices are numbers in range
        def verify_indices(inds):
            return list(filter(lambda i: isinstance(i, numbers.Number) and i < len(sentences), inds))

        ind_train = verify_indices(split_parameter[0])
        ind_val = verify_indices(split_parameter[1])
        ind_test = verify_indices(split_parameter[2])
    else:
        # Split sentences and dicts
        ind = list(range(len(sentences)))
        ind_train, ind_test = train_test_split(ind, test_size=split_parameter[2])
        ind_train, ind_val = train_test_split(ind_train, test_size=split_parameter[1])

    # Map indices to data
    train = np.array([sentences[x] for x in ind_train])
    test = np.array([sentences[x] for x in ind_test])
    val = np.array([sentences[x] for x in ind_val])

    info_train = np.array([info_dicts[x] for x in ind_train])
    info_test = np.array([info_dicts[x] for x in ind_test])
    info_val = np.array([info_dicts[x] for x in ind_val])

    added = 0

    # Extend vocabulary with training set tokens
    if extend_with > 0:
        wg = WordGenerator(train)
        vb = VocabBuilder(wg)
        vb.count_all_words()
        added = extend_vocab(self.vocabulary, vb, max_tokens=extend_with)

    # Wrap results
    result = [self.tokenize_sentences(s)[0] for s in [train, val, test]]
    result_infos = [info_train, info_val, info_test]

    return result, result_infos, added
Example 18
Source File: stability_serializer.py From tape-neurips2019 with MIT License | 4 votes |
def make_train_val_test_split(rd1, rd2, rd3, rd4, ssm2):
    # any rd4 sequence derived from any other previous sequence is out
    # EEHEE_rd1_0001.pdb
    # EEHEE_rd1_0001.pdb_hp
    # EEHEE_rd1_0001.pdb_random
    # other modifiers include
    # '_PG_hp'
    # '_buryD'
    # '_PG_hp_prottest_XXX'

    base_name = rd1['name'].str.split('.', n=1, expand=True)
    rd1['base_name'] = base_name[0]
    topology = rd1['base_name'].str.split('_', n=1, expand=True)
    rd1['topology'] = topology[0]

    base_name = rd2['name'].str.split('.', n=1, expand=True)
    rd2['base_name'] = base_name[0]
    topology = rd2['base_name'].str.split('_', n=1, expand=True)
    rd2['topology'] = topology[0]

    base_name = rd3['name'].str.split('.', n=1, expand=True)
    rd3['base_name'] = base_name[0]
    topology = rd3['base_name'].str.split('_', n=1, expand=True)
    rd3['topology'] = topology[0]

    base_name = rd4['name'].str.split('.', n=1, expand=True)
    rd4['base_name'] = base_name[0]
    topology = rd4['base_name'].str.split('_', n=1, expand=True)
    rd4['topology'] = topology[0]

    base_name = ssm2['name'].str.split('.', n=1, expand=True)
    ssm2['base_name'] = base_name[0]
    topology = ssm2['base_name'].str.split('_', n=1, expand=True)
    ssm2['topology'] = topology[0]

    # need to filter out all sequences from val based on the original ones...
    all_base = list(rd1.base_name.values)
    all_base.extend(rd2.base_name.values)
    all_base.extend(rd3.base_name.values)

    train = rd1
    train = train.append(rd2)
    train = train.append(rd3)
    # filter 1552 sequences that appear in training already
    train = train.append(rd4[rd4['base_name'].isin(all_base)])

    # 18145 remaining
    val_set = rd4[~rd4['base_name'].isin(all_base)]

    validation = pd.DataFrame()
    for topology in designed_topologies:
        top_set = val_set[val_set['topology'] == topology]
        # pick 200 base sequences for val
        base_seqs = np.random.choice(top_set.base_name.values, size=200)
        # use the base sequences + controls (buryD, PG_hp) for validation ~500
        val_for_topology = top_set[top_set['base_name'].isin(base_seqs)]
        validation = validation.append(val_for_topology)
        print('validation for topology {}'.format(topology))
        print(val_for_topology.shape[0])
        to_train = top_set[~top_set['base_name'].isin(base_seqs)]
        print(to_train.shape[0])
        train = train.append(to_train)

    # 5k more to train on that are not part of the designed topologies
    train = train.append(val_set[~val_set['topology'].isin(designed_topologies)])

    test = ssm2
    return train, validation, test
Example 19
Source File: train_test_split_utils.py From deep-molecular-massspec with Apache License 2.0 | 4 votes |
def make_train_val_test_split_inchikey_lists(train_inchikey_list,
                                             train_inchikey_dict,
                                             train_val_test_split_fractions,
                                             holdout_inchikey_list=None,
                                             splitting_type='random'):
    """Given inchikey lists, returns lists to use for train/val/test sets.

    If holdout_inchikey_list is given, the inchikeys in this list will be
    excluded from the returned train/validation/test lists.

    Args:
        train_inchikey_list : List of inchikeys to use for train/val/test sets
        train_inchikey_dict : Main dict keyed by inchikeys, values are lists of
            rdkit.Mol. Note that train_inchikey_dict.keys() != train_inchikey_list;
            train_inchikey_dict will have many more keys than are in the list.
        train_val_test_split_fractions : a TrainValTestFractions tuple
        holdout_inchikey_list : List of inchikeys to exclude from train/val/test sets.
        splitting_type : method of splitting molecules into train/val/test sets.

    Returns:
        A TrainValTestInchikeys namedtuple

    Raises:
        ValueError : if not train_val_test_split_sizes XOR
            train_val_test_split_fractions, or if specify a splitting_type that
            isn't implemented yet.
    """
    if not np.isclose([sum(train_val_test_split_fractions)], [1.0]):
        raise ValueError('Must specify train_val_test_split that sums to 1.0')

    if holdout_inchikey_list:
        # filter out those inchikeys that are in the holdout set.
        train_inchikey_list = [
            ikey for ikey in train_inchikey_list
            if ikey not in holdout_inchikey_list
        ]

    if splitting_type == 'random':
        return get_random_inchikeys(train_inchikey_list, train_val_test_split_fractions)
    else:
        # Assume that splitting_type is the name of a structure family.
        # get_inchikeys_by_family will throw an error if this is not supported.
        return get_inchikeys_by_family(
            train_inchikey_list,
            train_inchikey_dict,
            train_val_test_split_fractions,
            family_name=splitting_type,
            exclude_from_train=True)
Example 20
Source File: utils.py From gnn-meta-attack with MIT License | 4 votes |
def train_val_test_split_tabular(*arrays, train_size=0.5, val_size=0.3, test_size=0.2,
                                 stratify=None, random_state=None):
    """
    Split the arrays or matrices into random train, validation and test subsets.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays or scipy-sparse matrices.
    train_size : float, default 0.5
        Proportion of the dataset included in the train split.
    val_size : float, default 0.3
        Proportion of the dataset included in the validation split.
    test_size : float, default 0.2
        Proportion of the dataset included in the test split.
    stratify : array-like or None, default None
        If not None, data is split in a stratified fashion, using this as the class labels.
    random_state : int or None, default None
        Random_state is the seed used by the random number generator;

    Returns
    -------
    splitting : list, length=3 * len(arrays)
        List containing train-validation-test split of inputs.
    """
    if len(set(array.shape[0] for array in arrays)) != 1:
        raise ValueError("Arrays must have equal first dimension.")
    idx = np.arange(arrays[0].shape[0])
    idx_train_and_val, idx_test = train_test_split(idx,
                                                   random_state=random_state,
                                                   train_size=(train_size + val_size),
                                                   test_size=test_size,
                                                   stratify=stratify)
    if stratify is not None:
        stratify = stratify[idx_train_and_val]
    idx_train, idx_val = train_test_split(idx_train_and_val,
                                          random_state=random_state,
                                          train_size=(train_size / (train_size + val_size)),
                                          test_size=(val_size / (train_size + val_size)),
                                          stratify=stratify)
    result = []
    for X in arrays:
        result.append(X[idx_train])
        result.append(X[idx_val])
        result.append(X[idx_test])
    return result
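A short usage sketch for the tabular splitter above, assuming numpy and sklearn.model_selection.train_test_split are imported in the same module; the data here is random and purely illustrative. Passing several arrays splits them with the same indices, and the results come back grouped per array in train, val, test order.

import numpy as np

# Hypothetical data: 1000 samples, 20 features, binary labels.
X = np.random.rand(1000, 20)
y = np.random.randint(0, 2, size=1000)

X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split_tabular(
    X, y, train_size=0.5, val_size=0.3, test_size=0.2,
    stratify=y, random_state=0)
print(X_train.shape, X_val.shape, X_test.shape)  # (500, 20) (300, 20) (200, 20)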
Example 21
Source File: utils.py From nettack with MIT License | 4 votes |
def train_val_test_split_tabular(*arrays, train_size=0.5, val_size=0.3, test_size=0.2,
                                 stratify=None, random_state=None):
    """
    Split the arrays or matrices into random train, validation and test subsets.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays or scipy-sparse matrices.
    train_size : float, default 0.5
        Proportion of the dataset included in the train split.
    val_size : float, default 0.3
        Proportion of the dataset included in the validation split.
    test_size : float, default 0.2
        Proportion of the dataset included in the test split.
    stratify : array-like or None, default None
        If not None, data is split in a stratified fashion, using this as the class labels.
    random_state : int or None, default None
        Random_state is the seed used by the random number generator;

    Returns
    -------
    splitting : list, length=3 * len(arrays)
        List containing train-validation-test split of inputs.
    """
    if len(set(array.shape[0] for array in arrays)) != 1:
        raise ValueError("Arrays must have equal first dimension.")
    idx = np.arange(arrays[0].shape[0])
    idx_train_and_val, idx_test = train_test_split(idx,
                                                   random_state=random_state,
                                                   train_size=(train_size + val_size),
                                                   test_size=test_size,
                                                   stratify=stratify)
    if stratify is not None:
        stratify = stratify[idx_train_and_val]
    idx_train, idx_val = train_test_split(idx_train_and_val,
                                          random_state=random_state,
                                          train_size=(train_size / (train_size + val_size)),
                                          test_size=(val_size / (train_size + val_size)),
                                          stratify=stratify)
    result = []
    for X in arrays:
        result.append(X[idx_train])
        result.append(X[idx_val])
        result.append(X[idx_test])
    return result