Python train val test split

The following Python code examples, collected from open-source projects, show different ways to split a dataset into train, validation, and test subsets. The Source File line above each example names the original project and its license.
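Many of the examples below follow the same two-step pattern: carve off a test set first, then split what remains into train and validation, rescaling the validation fraction so it still refers to the whole dataset. A minimal, self-contained sketch of that pattern (the 0.7/0.15/0.15 ratios and the toy data are made up for illustration):

import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(100).reshape(50, 2)   # toy features
y = np.arange(50) % 2               # toy labels

# Step 1: hold out 15% of the data as the test set.
X_rest, X_test, y_rest, y_test = train_test_split(X, y, test_size=0.15, random_state=0)
# Step 2: split the remainder; 0.15 / 0.85 of it equals 15% of the original data.
X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, test_size=0.15 / 0.85, random_state=0)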
Example 1
Source File: audio_util.py    From Tensorflow-Audio-Classification with Apache License 2.0
def train_test_val_split(X, Y, split=(0.2, 0.1), shuffle=True):
    """Split dataset into train/val/test subsets by 70:20:10(default).
    
    Args:
      X: List of data.
      Y: List of labels corresponding to data.
      split: Tuple of split ratio in `test:val` order.
      shuffle: Whether to shuffle the data before splitting.

    Returns:
      Three (data, labels) tuples in `train:test:val` order.
    """
    from sklearn.model_selection import train_test_split
    assert len(X) == len(Y), 'The length of X and Y must be consistent.'
    X_train, X_test_val, Y_train, Y_test_val = train_test_split(X, Y, 
        test_size=(split[0]+split[1]), shuffle=shuffle)
    # Split the held-out pool so that val gets split[1] and test gets split[0]
    # of the full dataset, matching the documented 70:20:10 default.
    X_test, X_val, Y_test, Y_val = train_test_split(X_test_val, Y_test_val,
        test_size=split[1] / (split[0] + split[1]), shuffle=False)
    return (X_train, Y_train), (X_test, Y_test), (X_val, Y_val) 
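A quick sanity check of the proportions, assuming the function above is importable and using made-up data:

X = list(range(1000))
Y = [i % 2 for i in X]
(X_train, Y_train), (X_test, Y_test), (X_val, Y_val) = train_test_val_split(X, Y)
print(len(X_train), len(X_test), len(X_val))  # roughly 700, 200, 100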
Example 2
Source File: cv.py    From deep_pipe with MIT License
def train_val_test_split(ids, *, val_size, n_splits, random_state=42):
    """
    Splits the dataset's ids into triplets (train, validation, test).
    The test ids are determined as in the standard K-fold cross-validation setting:
    for each fold a different portion of 1/K ids is kept for testing.
    The remaining (K - 1) / K ids are split into train and validation sets according to ``val_size``.

    Parameters
    ----------
    ids
    val_size: float, int
        If ``float``, should be between 0.0 and 1.0 and represents the proportion
        of the train set to include in the validation set. If ``int``, represents the
        absolute number of validation samples.
    n_splits: int
        the number of cross-validation folds.

    Returns
    -------
    splits: Sequence of triplets
    """
    split_indices = kfold_split(subj_ids=ids, n_splits=n_splits, random_state=random_state)
    split_indices = split_train(splits=split_indices, val_size=val_size, random_state=random_state)
    return indices_to_ids(split_indices, ids) 
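kfold_split, split_train and indices_to_ids are helpers from the deep_pipe package and are not shown here. A rough, self-contained sketch of the same triplet logic using plain scikit-learn (not the deep_pipe implementation) could look like:

import numpy as np
from sklearn.model_selection import KFold, train_test_split

def kfold_train_val_test(ids, *, val_size, n_splits, random_state=42):
    # Each fold's held-out portion becomes the test set; the remaining ids
    # are split again into train and validation according to val_size.
    ids = np.asarray(ids)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    splits = []
    for rest_idx, test_idx in kf.split(ids):
        train_idx, val_idx = train_test_split(rest_idx, test_size=val_size, random_state=random_state)
        splits.append((ids[train_idx], ids[val_idx], ids[test_idx]))
    return splits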
Example 3
Source File: data.py    From adversarial-object-removal with MIT License
def split_train_val_test(data_dir, img_size=256):
    df = pd.read_csv(
        join(data_dir, 'list_eval_partition.txt'),
        delim_whitespace=True, header=None
    )
    filenames, labels = df.values[:, 0], df.values[:, 1]

    train_filenames = filenames[labels == 0]
    valid_filenames = filenames[labels == 1]
    test_filenames  = filenames[labels == 2]

    train_set = Dataset(
        data_dir, train_filenames, input_transform_augment(178, img_size),
        target_transform(), target_transform_binary()
    )
    valid_set = Dataset(
        data_dir, valid_filenames, input_transform(178, img_size),
        target_transform(), target_transform_binary()
    )
    test_set = Dataset(
        data_dir, test_filenames, input_transform(178, img_size),
        target_transform(), target_transform_binary()
    )

    return train_set, valid_set, test_set 
Example 4
Source File: Data.py    From pysster with MIT License
def train_val_test_split(self, portion_train, portion_val, seed = None):
        """ Randomly split the data into training, validation and test set.

        Example: setting portion_train = 0.6 and portion_val = 0.3 will set aside 60% of the data
        for training, 30% for validation and the remaining 10% for testing. Use the seed parameter
        to get reproducible splits.
        
        Parameters
        ----------
        portion_train: float
            Portion of data that should be used for training (<1.0) 
        
        portion_val: float
            Portion of data that should be used for validation (<1.0)
        
        seed: int
            Seed for the random number generator.
        """
        if seed is not None:
            np.random.seed(seed)
        num_sequences = len(self.data)
        break_train = int(num_sequences * portion_train)
        break_val = int(num_sequences * (portion_train + portion_val))
        splits = np.random.permutation(np.arange(num_sequences))
        splits = np.split(splits, [break_train, break_val])
        self.splits = {"train": splits[0], "val": splits[1], "test": splits[2]} 
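Assuming data is a pysster Data object, a call with the docstring's example values would look like:

data.train_val_test_split(portion_train=0.6, portion_val=0.3, seed=42)
# data.splits["train"], data.splits["val"] and data.splits["test"] now hold index arrays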
Example 5
Source File: preprocess.py    From DeepLab_v3 with MIT License
def train_val_test_split(dataset_filenames, split_ratios, train_dataset_filename, valid_dataset_filename, test_dataset_filename):
    '''
    Split dataset into train, valid, and test datasets
    dataset_filenames: a list of image filenames
    split_ratios: [train_dataset_ratio, valid_dataset_ratio, test_dataset_ratio], e.g., [0.7, 0.2, 0.1]
    train_dataset_filename: path of txt file to save the filenames of train data
    valid_dataset_filename: path of txt file to save the filenames of valid data
    test_dataset_filename: path of txt file to save the filenames of test data
    '''

    assert len(split_ratios) == 3 and abs(sum(split_ratios) - 1) < 1e-5, 'Please use all the data.'

    dataset_filenames = np.asarray(dataset_filenames)
    idx = np.arange(len(dataset_filenames))
    np.random.shuffle(idx)
    train_split_idx = int(len(dataset_filenames) * split_ratios[0])
    valid_split_idx = int(len(dataset_filenames) * (split_ratios[0] + split_ratios[1]))

    train_idx = idx[:train_split_idx]
    valid_idx = idx[train_split_idx:valid_split_idx]
    test_idx = idx[valid_split_idx:]

    train_filenames = dataset_filenames[train_idx]
    valid_filenames = dataset_filenames[valid_idx]
    test_filenames = dataset_filenames[test_idx]

    with open(train_dataset_filename, 'w') as file:
        file.write('\n'.join(train_filenames))
    with open(valid_dataset_filename, 'w') as file:
        file.write('\n'.join(valid_filenames))
    with open(test_dataset_filename, 'w') as file:
        file.write('\n'.join(test_filenames)) 
Example 6
Source File: data_prep.py    From Deep-Learning-Quick-Reference with MIT License
def train_val_test_split(df, val_pct=0.1, test_pct=0.1):
    size = df.shape[0]
    val_pct = (val_pct * size) / (size * (1 - test_pct))
    train_val, test = train_test_split(df, test_size=test_pct)
    train, val = train_test_split(train_val, test_size=val_pct)
    return train, val, test 
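The val_pct reassignment is the usual correction for splitting in two passes: once the test rows are removed, the validation fraction must be taken relative to what is left (0.1 / (1 - 0.1) ~ 0.111 of the remainder is 10% of the whole). A quick check with a made-up DataFrame:

import pandas as pd

df = pd.DataFrame({"x": range(1000)})
train, val, test = train_val_test_split(df, val_pct=0.1, test_pct=0.1)
print(len(train), len(val), len(test))  # roughly 800, 100, 100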
Example 7
Source File: Train_Val_Test_spliter.py    From ViolenceDetection with Apache License 2.0
def Split_Train_Val_Test_Data(LIST_OF_VIDEOS_):
	random.shuffle(LIST_OF_VIDEOS_)
	NUMBER_OF_TOTAL_DATA = len(LIST_OF_VIDEOS_)
	NUMBER_OF_TEST_VIDEOS = int(NUMBER_OF_TOTAL_DATA * TEST_SET_RATIO)
	NUMBER_OF_VAL_VIDEOS = int(NUMBER_OF_TOTAL_DATA * VAL_SET_RATIO)

	listOfTestVideos = LIST_OF_VIDEOS_[ : NUMBER_OF_TEST_VIDEOS]
	listOfValVideos = LIST_OF_VIDEOS_[NUMBER_OF_TEST_VIDEOS : (NUMBER_OF_TEST_VIDEOS+NUMBER_OF_VAL_VIDEOS)]
	listOfTrainVideos = LIST_OF_VIDEOS_[(NUMBER_OF_TEST_VIDEOS+NUMBER_OF_VAL_VIDEOS) : ]

	return listOfTrainVideos, listOfValVideos, listOfTestVideos 
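TEST_SET_RATIO and VAL_SET_RATIO are module-level constants in the original script (the function also relies on the random module). With made-up values and a made-up file list, usage would be roughly:

import random

TEST_SET_RATIO = 0.1   # illustrative values, not taken from the original project
VAL_SET_RATIO = 0.2
videos = ['video_%03d.avi' % i for i in range(100)]
train_videos, val_videos, test_videos = Split_Train_Val_Test_Data(videos)
print(len(train_videos), len(val_videos), len(test_videos))  # 70, 20, 10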
Example 8
Source File: app.py    From edge_detection_framework with MIT License
def train_val_test_split(id_lists, train_fraction, val_fraction, test_fraction):
    train_ids = []
    val_ids = []
    test_ids = []

    for dataset_idx, id_list in enumerate(id_lists):
        print('dataset', dataset_idx, 'contains', len(id_list), 'items.')
        train, val, test = make_splits(id_list, [train_fraction, val_fraction, test_fraction])
        train_ids += train
        val_ids += val
        test_ids += test
        print('train_ids', len(train_ids), 'val_ids', len(val_ids), 'test_ids', len(test_ids))

    return {'train': train_ids, 'valid': val_ids, 'test': test_ids} 
Example 9
Source File: process_vad_data.py    From NeMo with Apache License 2.0
def split_train_val_test(data_dir, file_type, test_size=0.1, val_size=0.1):
    X = []
    if file_type == "speech":
        for o in os.listdir(data_dir):
            if os.path.isdir(os.path.join(data_dir, o)) and o.split("/")[-1] != "_background_noise_":
                X.extend(glob.glob(os.path.join(data_dir, o) + '/*.wav'))
    else:
        for o in os.listdir(data_dir):
            if os.path.isdir(os.path.join(data_dir, o)):
                X.extend(glob.glob(os.path.join(data_dir, o) + '/*.wav'))
            else:  # for using "_background_noise_" from google speech commands as background data
                if o.endswith(".wav"):
                    X.append(os.path.join(data_dir, o))

    X_train, X_test = train_test_split(X, test_size=test_size, random_state=1)
    val_size_tmp = val_size / (1 - test_size)
    X_train, X_val = train_test_split(X_train, test_size=val_size_tmp, random_state=1)

    with open(os.path.join(data_dir, file_type + "_training_list.txt"), "w") as outfile:
        outfile.write("\n".join(X_train))
    with open(os.path.join(data_dir, file_type + "_testing_list.txt"), "w") as outfile:
        outfile.write("\n".join(X_test))
    with open(os.path.join(data_dir, file_type + "_validation_list.txt"), "w") as outfile:
        outfile.write("\n".join(X_val))

    logging.info(f'Overall: {len(X)}, Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}')
    logging.info(f"Finished splitting train, val and test lists for {file_type} and wrote them to files.") 
Example 10
Source File: preprocessing_orb.py    From Indian-Sign-Language-Recognition with MIT License
def train_test_val_split_idxs(total_rows, percent_test, percent_val):
    """
    Get indexes for training, test, and validation rows, given a total number of rows.
    Assumes indexes are sequential integers starting at 0, e.g. [0, 1, 2, ..., N].
    Returns:
    --------
    training_idxs, test_idxs, val_idxs
        Each a list of integers
    """
    if percent_test + percent_val >= 1.0:
        raise ValueError('percent_test and percent_val must sum to less than 1.0')

    row_range = range(total_rows)

    no_test_rows = int(total_rows*(percent_test))
    test_idxs = np.random.choice(row_range, size=no_test_rows, replace=False)
    # remove test indexes
    row_range = [idx for idx in row_range if idx not in test_idxs]

    no_val_rows = int(total_rows*(percent_val))
    val_idxs = np.random.choice(row_range, size=no_val_rows, replace=False)
    # remove validation indexes
    training_idxs = [idx for idx in row_range if idx not in val_idxs]

    print('Train-test-val split: %i training rows, %i test rows, %i validation rows' % (len(training_idxs), len(test_idxs), len(val_idxs)))

    return training_idxs, test_idxs, val_idxs 
Example 11
Source File: usermanager.py    From tatk with Apache License 2.0
def train_test_val_split(goals, usr_dass, sys_dass, test_size=0.1, val_size=0.1):
        idx = range(len(goals))
        idx_test = random.sample(idx, int(len(goals) * test_size))
        idx_train = list(set(idx) - set(idx_test))
        idx_val = random.sample(idx_train, int(len(goals) * val_size))
        idx_train = list(set(idx_train) - set(idx_val))
        idx_train = random.sample(idx_train, len(idx_train))
        return np.array(goals)[idx_train], np.array(usr_dass)[idx_train], np.array(sys_dass)[idx_train], \
               np.array(goals)[idx_test], np.array(usr_dass)[idx_test], np.array(sys_dass)[idx_test], \
               np.array(goals)[idx_val], np.array(usr_dass)[idx_val], np.array(sys_dass)[idx_val] 
Example 12
Source File: usermanager.py    From tatk with Apache License 2.0
def train_test_val_split_seg(goals_seg, usr_dass_seg, sys_dass_seg, test_size=0.1, val_size=0.1):
        def dr(dss):
            return np.array([d for ds in dss for d in ds])

        idx = range(len(goals_seg))
        idx_test = random.sample(idx, int(len(goals_seg) * test_size))
        idx_train = list(set(idx) - set(idx_test))
        idx_val = random.sample(idx_train, int(len(goals_seg) * val_size))
        idx_train = list(set(idx_train) - set(idx_val))
        idx_train = random.sample(idx_train, len(idx_train))
        return dr(np.array(goals_seg)[idx_train]), dr(np.array(usr_dass_seg)[idx_train]), dr(np.array(sys_dass_seg)[idx_train]), \
               dr(np.array(goals_seg)[idx_test]), dr(np.array(usr_dass_seg)[idx_test]), dr(np.array(sys_dass_seg)[idx_test]), \
               dr(np.array(goals_seg)[idx_val]), dr(np.array(usr_dass_seg)[idx_val]), dr(np.array(sys_dass_seg)[idx_val]) 
Example 13
Source File: data.py    From gdc with MIT License
def set_train_val_test_split(
        seed: int,
        data: Data,
        num_development: int = 1500,
        num_per_class: int = 20) -> Data:
    rnd_state = np.random.RandomState(development_seed)
    num_nodes = data.y.shape[0]
    development_idx = rnd_state.choice(num_nodes, num_development, replace=False)
    test_idx = [i for i in np.arange(num_nodes) if i not in development_idx]

    train_idx = []
    rnd_state = np.random.RandomState(seed)
    for c in range(data.y.max() + 1):
        class_idx = development_idx[np.where(data.y[development_idx].cpu() == c)[0]]
        train_idx.extend(rnd_state.choice(class_idx, num_per_class, replace=False))

    val_idx = [i for i in development_idx if i not in train_idx]

    def get_mask(idx):
        mask = torch.zeros(num_nodes, dtype=torch.bool)
        mask[idx] = 1
        return mask

    data.train_mask = get_mask(train_idx)
    data.val_mask = get_mask(val_idx)
    data.test_mask = get_mask(test_idx)

    return data 
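The boolean masks follow the usual PyTorch Geometric convention: each mask selects the nodes whose loss or metrics are computed in that phase. A usage sketch with hypothetical logits (no actual GNN is run here):

import torch
import torch.nn.functional as F

num_classes = int(data.y.max()) + 1
logits = torch.randn(data.y.shape[0], num_classes)   # stand-in for a model's output
loss = F.cross_entropy(logits[data.train_mask], data.y[data.train_mask])
val_acc = (logits[data.val_mask].argmax(dim=1) == data.y[data.val_mask]).float().mean()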
Example 14
Source File: preprocess_data.py    From deep-regex with MIT License
def split_train_test_val(ar, ratio):
    train_set = ar[:int(len(ar)*ratio)]
    not_train_set = ar[int(len(ar)*ratio):]
    val_set = not_train_set[int(len(not_train_set)*(5.0/7.0)):]
    test_set = not_train_set[:int(len(not_train_set)*(5.0/7.0))]

    return train_set, val_set, test_set 
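The hard-coded 5/7 means the held-out remainder is always divided 5:2 between test and validation. With ratio = 0.65, for example, the remaining 35% yields a 25% test set and a 10% validation set. A quick check on made-up data:

ar = list(range(1000))
train_set, val_set, test_set = split_train_test_val(ar, ratio=0.65)
print(len(train_set), len(val_set), len(test_set))  # 650, 100, 250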
Example 15
Source File: sentence_tokenizer.py    From neural_chat with MIT License
def split_train_val_test(self, sentences, info_dicts,
                             split_parameter=[0.7, 0.1, 0.2], extend_with=0):
        """ Splits given sentences into three different datasets: training,
            validation and testing.

        # Arguments:
            sentences: The sentences to be tokenized.
            info_dicts: A list of dicts that contain information about each
                sentence (e.g. a label).
            split_parameter: A parameter for deciding the splits between the
                three different datasets. If instead of being passed three
                values, three lists are passed, then these will be used to
                specify which observations belong to which dataset.
            extend_with: An optional parameter. If > 0 then this is the number
                of tokens added to the vocabulary from this dataset. The
                expanded vocab will be generated using only the training set,
                but is applied to all three sets.

        # Returns:
            List of three lists of tokenized sentences,

            List of three corresponding dictionaries with information,

            How many tokens have been added to the vocab. Make sure to extend
            the embedding layer of the model accordingly.
        """

        # If passed three lists, use those directly
        if isinstance(split_parameter, list) and \
                all(isinstance(x, list) for x in split_parameter) and \
                len(split_parameter) == 3:

            # Helper function to verify provided indices are numbers in range
            def verify_indices(inds):
                return list(filter(lambda i: isinstance(i, numbers.Number)
                            and i < len(sentences), inds))

            ind_train = verify_indices(split_parameter[0])
            ind_val = verify_indices(split_parameter[1])
            ind_test = verify_indices(split_parameter[2])
        else:
            # Split sentences and dicts
            ind = list(range(len(sentences)))
            ind_train, ind_test = train_test_split(ind, test_size=split_parameter[2])
            ind_train, ind_val = train_test_split(ind_train, test_size=split_parameter[1])

        # Map indices to data
        train = np.array([sentences[x] for x in ind_train])
        test = np.array([sentences[x] for x in ind_test])
        val = np.array([sentences[x] for x in ind_val])

        info_train = np.array([info_dicts[x] for x in ind_train])
        info_test = np.array([info_dicts[x] for x in ind_test])
        info_val = np.array([info_dicts[x] for x in ind_val])

        added = 0
        # Extend vocabulary with training set tokens
        if extend_with > 0:
            wg = WordGenerator(train)
            vb = VocabBuilder(wg)
            vb.count_all_words()
            added = extend_vocab(self.vocabulary, vb, max_tokens=extend_with)

        # Wrap results
        result = [self.tokenize_sentences(s)[0] for s in [train, val, test]]
        result_infos = [info_train, info_val, info_test]
        # if type(result_infos[0][0]) in [np.double, np.float, np.int64, np.int32, np.uint8]:
        #     result_infos = [torch.from_numpy(label).long() for label in result_infos]

        return result, result_infos, added 
Example 16
Source File: make_dataset.py    From gnn-benchmark with MIT License
def get_train_val_test_split(random_state,
                             labels,
                             train_examples_per_class=None, val_examples_per_class=None,
                             test_examples_per_class=None,
                             train_size=None, val_size=None, test_size=None):
    num_samples, num_classes = labels.shape
    remaining_indices = list(range(num_samples))

    if train_examples_per_class is not None:
        train_indices = sample_per_class(random_state, labels, train_examples_per_class)
    else:
        # select train examples with no respect to class distribution
        train_indices = random_state.choice(remaining_indices, train_size, replace=False)

    if val_examples_per_class is not None:
        val_indices = sample_per_class(random_state, labels, val_examples_per_class, forbidden_indices=train_indices)
    else:
        remaining_indices = np.setdiff1d(remaining_indices, train_indices)
        val_indices = random_state.choice(remaining_indices, val_size, replace=False)

    forbidden_indices = np.concatenate((train_indices, val_indices))
    if test_examples_per_class is not None:
        test_indices = sample_per_class(random_state, labels, test_examples_per_class,
                                        forbidden_indices=forbidden_indices)
    elif test_size is not None:
        remaining_indices = np.setdiff1d(remaining_indices, forbidden_indices)
        test_indices = random_state.choice(remaining_indices, test_size, replace=False)
    else:
        test_indices = np.setdiff1d(remaining_indices, forbidden_indices)

    # assert that there are no duplicates in sets
    assert len(set(train_indices)) == len(train_indices)
    assert len(set(val_indices)) == len(val_indices)
    assert len(set(test_indices)) == len(test_indices)
    # assert sets are mutually exclusive
    assert len(set(train_indices) - set(val_indices)) == len(set(train_indices))
    assert len(set(train_indices) - set(test_indices)) == len(set(train_indices))
    assert len(set(val_indices) - set(test_indices)) == len(set(val_indices))
    if test_size is None and test_examples_per_class is None:
        # all indices must be part of the split
        assert len(np.concatenate((train_indices, val_indices, test_indices))) == num_samples

    if train_examples_per_class is not None:
        train_labels = labels[train_indices, :]
        train_sum = np.sum(train_labels, axis=0)
        # assert all classes have equal cardinality
        assert np.unique(train_sum).size == 1

    if val_examples_per_class is not None:
        val_labels = labels[val_indices, :]
        val_sum = np.sum(val_labels, axis=0)
        # assert all classes have equal cardinality
        assert np.unique(val_sum).size == 1

    if test_examples_per_class is not None:
        test_labels = labels[test_indices, :]
        test_sum = np.sum(test_labels, axis=0)
        # assert all classes have equal cardinality
        assert np.unique(test_sum).size == 1

    return train_indices, val_indices, test_indices 
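sample_per_class is defined elsewhere in that project and is not shown here. A hypothetical helper consistent with how it is called above (labels as a one-hot matrix, forbidden_indices excluded from sampling) might look like:

import numpy as np

def sample_per_class(random_state, labels, num_examples_per_class, forbidden_indices=None):
    # labels: (num_samples, num_classes) one-hot label matrix
    num_samples, num_classes = labels.shape
    forbidden = set(forbidden_indices) if forbidden_indices is not None else set()
    sampled = []
    for c in range(num_classes):
        class_indices = [i for i in range(num_samples)
                         if labels[i, c] > 0 and i not in forbidden]
        sampled.extend(random_state.choice(class_indices, num_examples_per_class, replace=False))
    return np.array(sampled)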
Example 17
Source File: sentence_tokenizer.py    From DeepMoji with MIT License
def split_train_val_test(self, sentences, info_dicts,
                             split_parameter=[0.7, 0.1, 0.2], extend_with=0):
        """ Splits given sentences into three different datasets: training,
            validation and testing.

        # Arguments:
            sentences: The sentences to be tokenized.
            info_dicts: A list of dicts that contain information about each
                sentence (e.g. a label).
            split_parameter: A parameter for deciding the splits between the
                three different datasets. If instead of being passed three
                values, three lists are passed, then these will be used to
                specify which observations belong to which dataset.
            extend_with: An optional parameter. If > 0 then this is the number
                of tokens added to the vocabulary from this dataset. The
                expanded vocab will be generated using only the training set,
                but is applied to all three sets.

        # Returns:
            List of three lists of tokenized sentences,

            List of three corresponding dictionaries with information,

            How many tokens have been added to the vocab. Make sure to extend
            the embedding layer of the model accordingly.
        """

        # If passed three lists, use those directly
        if isinstance(split_parameter, list) and \
                all(isinstance(x, list) for x in split_parameter) and \
                len(split_parameter) == 3:

            # Helper function to verify provided indices are numbers in range
            def verify_indices(inds):
                return list(filter(lambda i: isinstance(i, numbers.Number) and
                                   i < len(sentences), inds))

            ind_train = verify_indices(split_parameter[0])
            ind_val = verify_indices(split_parameter[1])
            ind_test = verify_indices(split_parameter[2])
        else:
            # Split sentences and dicts
            ind = list(range(len(sentences)))
            ind_train, ind_test = train_test_split(ind, test_size=split_parameter[2])
            ind_train, ind_val = train_test_split(ind_train, test_size=split_parameter[1])

        # Map indices to data
        train = np.array([sentences[x] for x in ind_train])
        test = np.array([sentences[x] for x in ind_test])
        val = np.array([sentences[x] for x in ind_val])

        info_train = np.array([info_dicts[x] for x in ind_train])
        info_test = np.array([info_dicts[x] for x in ind_test])
        info_val = np.array([info_dicts[x] for x in ind_val])

        added = 0
        # Extend vocabulary with training set tokens
        if extend_with > 0:
            wg = WordGenerator(train)
            vb = VocabBuilder(wg)
            vb.count_all_words()
            added = extend_vocab(self.vocabulary, vb, max_tokens=extend_with)

        # Wrap results
        result = [self.tokenize_sentences(s)[0] for s in [train, val, test]]
        result_infos = [info_train, info_val, info_test]

        return result, result_infos, added 
Example 18
Source File: stability_serializer.py    From tape-neurips2019 with MIT License
def make_train_val_test_split(rd1, rd2, rd3, rd4, ssm2):
    # any rd4 sequence derived from any other previous sequence is out
    # EEHEE_rd1_0001.pdb
    # EEHEE_rd1_0001.pdb_hp
    # EEHEE_rd1_0001.pdb_random
    # other modifiers include
    # '_PG_hp'
    # '_buryD'
    # '_PG_hp_prottest_XXX'

    base_name = rd1['name'].str.split('.', n=1, expand=True)
    rd1['base_name'] = base_name[0]
    topology = rd1['base_name'].str.split('_', n=1, expand=True)
    rd1['topology'] = topology[0]

    base_name = rd2['name'].str.split('.', n=1, expand=True)
    rd2['base_name'] = base_name[0]
    topology = rd2['base_name'].str.split('_', n=1, expand=True)
    rd2['topology'] = topology[0]

    base_name = rd3['name'].str.split('.', n=1, expand=True)
    rd3['base_name'] = base_name[0]
    topology = rd3['base_name'].str.split('_', n=1, expand=True)
    rd3['topology'] = topology[0]

    base_name = rd4['name'].str.split('.', n=1, expand=True)
    rd4['base_name'] = base_name[0]
    topology = rd4['base_name'].str.split('_', n=1, expand=True)
    rd4['topology'] = topology[0]

    base_name = ssm2['name'].str.split('.', n=1, expand=True)
    ssm2['base_name'] = base_name[0]
    topology = ssm2['base_name'].str.split('_', n=1, expand=True)
    ssm2['topology'] = topology[0]

    # need to filter out all sequences from val based on the original ones...
    all_base = list(rd1.base_name.values)
    all_base.extend(rd2.base_name.values)
    all_base.extend(rd3.base_name.values)

    train = rd1
    train = train.append(rd2)
    train = train.append(rd3)

    # filter 1552 sequences that appear in training already
    train = train.append(rd4[rd4['base_name'].isin(all_base)])
    # 18145 remaining
    val_set = rd4[~rd4['base_name'].isin(all_base)]

    validation = pd.DataFrame()
    for topology in designed_topologies:
        top_set = val_set[val_set['topology'] == topology]
        # pick 200 base sequences for val
        base_seqs = np.random.choice(top_set.base_name.values, size=200)
        # use the base sequences + controls (buryD, PG_hp) for validation ~500
        val_for_topology = top_set[top_set['base_name'].isin(base_seqs)]
        validation = validation.append(val_for_topology)
        print('validation for topology {}'.format(topology))
        print(val_for_topology.shape[0])
        to_train = top_set[~top_set['base_name'].isin(base_seqs)]
        print(to_train.shape[0])
        train = train.append(to_train)

    # 5k more to train on that are not part of the designed topologies
    train = train.append(val_set[~val_set['topology'].isin(designed_topologies)])

    test = ssm2
    return train, validation, test 
Example 19
Source File: train_test_split_utils.py    From deep-molecular-massspec with Apache License 2.0
def make_train_val_test_split_inchikey_lists(train_inchikey_list,
                                             train_inchikey_dict,
                                             train_val_test_split_fractions,
                                             holdout_inchikey_list=None,
                                             splitting_type='random'):
  """Given inchikey lists, returns lists to use for train/val/test sets.

  If holdout_inchikey_list is given, the inchikeys in this list will be excluded
  from the returned train/validation/test lists.

  Args:
    train_inchikey_list : List of inchikeys to use for train/val/test sets
    train_inchikey_dict : Main dict keyed by inchikeys, values are lists of
        rdkit.Mol. Note that train_inchikey_dict.keys() != train_inchikey_list;
        train_inchikey_dict will have many more keys than are in the list.
    train_val_test_split_fractions : a TrainValTestFractions tuple
    holdout_inchikey_list : List of inchikeys to exclude from train/val/test
                            sets.
    splitting_type : method of splitting molecules into train/val/test sets.
  Returns:
    A TrainValTestInchikeys namedtuple
  Raises:
    ValueError : if train_val_test_split_fractions does not sum to 1.0,
                 or if a splitting_type is specified that isn't implemented yet.
  """
  if not np.isclose([sum(train_val_test_split_fractions)], [1.0]):
    raise ValueError('Must specify train_val_test_split that sums to 1.0')

  if holdout_inchikey_list:
    # filter out those inchikeys that are in the holdout set.
    train_inchikey_list = [
        ikey for ikey in train_inchikey_list
        if ikey not in holdout_inchikey_list
    ]

  if splitting_type == 'random':
    return get_random_inchikeys(train_inchikey_list,
                                train_val_test_split_fractions)
  else:
    # Assume that splitting_type is the name of a structure family.
    # get_inchikeys_by_family will throw an error if this is not supported.
    return get_inchikeys_by_family(
        train_inchikey_list,
        train_inchikey_dict,
        train_val_test_split_fractions,
        family_name=splitting_type,
        exclude_from_train=True) 
Example 20
Source File: utils.py    From gnn-meta-attack with MIT License
def train_val_test_split_tabular(*arrays, train_size=0.5, val_size=0.3, test_size=0.2, stratify=None,
                                 random_state=None):

    """
    Split the arrays or matrices into random train, validation and test subsets.
    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
            Allowed inputs are lists, numpy arrays or scipy-sparse matrices.
    train_size : float, default 0.5
        Proportion of the dataset included in the train split.
    val_size : float, default 0.3
        Proportion of the dataset included in the validation split.
    test_size : float, default 0.2
        Proportion of the dataset included in the test split.
    stratify : array-like or None, default None
        If not None, data is split in a stratified fashion, using this as the class labels.
    random_state : int or None, default None
        Random_state is the seed used by the random number generator.
    Returns
    -------
    splitting : list, length=3 * len(arrays)
        List containing train-validation-test split of inputs.
    """
    if len(set(array.shape[0] for array in arrays)) != 1:
        raise ValueError("Arrays must have equal first dimension.")
    idx = np.arange(arrays[0].shape[0])
    idx_train_and_val, idx_test = train_test_split(idx,
                                                   random_state=random_state,
                                                   train_size=(train_size + val_size),
                                                   test_size=test_size,
                                                   stratify=stratify)
    if stratify is not None:
        stratify = stratify[idx_train_and_val]
    idx_train, idx_val = train_test_split(idx_train_and_val,
                                          random_state=random_state,
                                          train_size=(train_size / (train_size + val_size)),
                                          test_size=(val_size / (train_size + val_size)),
                                          stratify=stratify)
    result = []
    for X in arrays:
        result.append(X[idx_train])
        result.append(X[idx_val])
        result.append(X[idx_test])
    return result 
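Because the return value interleaves the splits per input array (train, val, test for the first array, then for the second, and so on), a call with features and labels unpacks like this (toy data):

import numpy as np

X = np.random.rand(100, 5)
y = np.random.randint(0, 2, size=100)
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split_tabular(
    X, y, train_size=0.5, val_size=0.3, test_size=0.2, stratify=y, random_state=0)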
Example 21
Source File: utils.py    From nettack with MIT License
def train_val_test_split_tabular(*arrays, train_size=0.5, val_size=0.3, test_size=0.2, stratify=None, random_state=None):

    """
    Split the arrays or matrices into random train, validation and test subsets.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
            Allowed inputs are lists, numpy arrays or scipy-sparse matrices.
    train_size : float, default 0.5
        Proportion of the dataset included in the train split.
    val_size : float, default 0.3
        Proportion of the dataset included in the validation split.
    test_size : float, default 0.2
        Proportion of the dataset included in the test split.
    stratify : array-like or None, default None
        If not None, data is split in a stratified fashion, using this as the class labels.
    random_state : int or None, default None
        Random_state is the seed used by the random number generator.

    Returns
    -------
    splitting : list, length=3 * len(arrays)
        List containing train-validation-test split of inputs.

    """
    if len(set(array.shape[0] for array in arrays)) != 1:
        raise ValueError("Arrays must have equal first dimension.")
    idx = np.arange(arrays[0].shape[0])
    idx_train_and_val, idx_test = train_test_split(idx,
                                                   random_state=random_state,
                                                   train_size=(train_size + val_size),
                                                   test_size=test_size,
                                                   stratify=stratify)
    if stratify is not None:
        stratify = stratify[idx_train_and_val]
    idx_train, idx_val = train_test_split(idx_train_and_val,
                                          random_state=random_state,
                                          train_size=(train_size / (train_size + val_size)),
                                          test_size=(val_size / (train_size + val_size)),
                                          stratify=stratify)
    result = []
    for X in arrays:
        result.append(X[idx_train])
        result.append(X[idx_val])
        result.append(X[idx_test])
    return result