Python sklearn.utils.class_weight.compute_class_weight() Examples

The following are 21 code examples of sklearn.utils.class_weight.compute_class_weight(), drawn from open-source projects; the source file and originating project are noted above each example. You may also want to check out the other functions and classes available in the module sklearn.utils.class_weight.
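Note that the examples on this page use the older positional call style, compute_class_weight('balanced', classes, y). In scikit-learn 1.0 and later, classes and y are keyword-only arguments, so a current minimal usage looks like the following sketch (the toy label array is made up for illustration):

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Toy, imbalanced label vector: class 0 is three times as frequent as class 1.
y = np.array([0, 0, 0, 0, 0, 0, 1, 1])
classes = np.unique(y)

# 'balanced' weights are n_samples / (n_classes * count_per_class),
# so rarer classes receive proportionally larger weights.
weights = compute_class_weight(class_weight="balanced", classes=classes, y=y)
print(weights)  # approximately [0.667, 2.0]; the rarer class gets the larger weight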
Example #1
Source File: cnn_class.py    From eyenet with MIT License
def split_data(self, y_file_path, X, test_data_size=0.2):
        """
        Split data into test and training data sets.

        INPUT
            y_file_path: path to CSV containing labels
            X: NumPy array of arrays
            test_data_size: size of test/train split. Value from 0 to 1

        OUTPUT
            Four arrays: X_train, X_test, y_train, and y_test
        """
        # labels = pd.read_csv(y_file_path, nrows=60)
        labels = pd.read_csv(y_file_path)
        self.X = np.load(X)
        self.y = np.array(labels['level'])
        self.weights = class_weight.compute_class_weight('balanced', np.unique(self.y), self.y)
        self.test_data_size = test_data_size
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y,
                                                                                test_size=self.test_data_size,
                                                                                random_state=42) 
Example #2
Source File: test_class_weight.py    From twitter-stock-recommendation with MIT License
def test_compute_class_weight_balanced_negative():
    # Test compute_class_weight when labels are negative
    # Test with balanced class labels.
    classes = np.array([-2, -1, 0])
    y = np.asarray([-1, -1, 0, 0, -2, -2])

    cw = compute_class_weight("balanced", classes, y)
    assert_equal(len(cw), len(classes))
    assert_array_almost_equal(cw, np.array([1., 1., 1.]))

    # Test with unbalanced class labels.
    y = np.asarray([-1, 0, 0, -2, -2, -2])

    cw = compute_class_weight("balanced", classes, y)
    assert_equal(len(cw), len(classes))
    class_counts = np.bincount(y + 2)
    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
    assert_array_almost_equal(cw, [2. / 3, 2., 1.]) 
Example #3
Source File: test_class_weight.py    From twitter-stock-recommendation with MIT License
def test_compute_class_weight_dict():
    classes = np.arange(3)
    class_weights = {0: 1.0, 1: 2.0, 2: 3.0}
    y = np.asarray([0, 0, 1, 2])
    cw = compute_class_weight(class_weights, classes, y)

    # When the user specifies class weights, compute_class_weights should just
    # return them.
    assert_array_almost_equal(np.asarray([1.0, 2.0, 3.0]), cw)

    # When a class weight is specified that isn't in classes, a ValueError
    # should get raised
    msg = 'Class label 4 not present.'
    class_weights = {0: 1.0, 1: 2.0, 2: 3.0, 4: 1.5}
    assert_raise_message(ValueError, msg, compute_class_weight, class_weights,
                         classes, y)
    msg = 'Class label -1 not present.'
    class_weights = {-1: 5.0, 0: 1.0, 1: 2.0, 2: 3.0}
    assert_raise_message(ValueError, msg, compute_class_weight, class_weights,
                         classes, y) 
Example #4
Source File: test_class_weight.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_compute_class_weight_dict():
    classes = np.arange(3)
    class_weights = {0: 1.0, 1: 2.0, 2: 3.0}
    y = np.asarray([0, 0, 1, 2])
    cw = compute_class_weight(class_weights, classes, y)

    # When the user specifies class weights, compute_class_weights should just
    # return them.
    assert_array_almost_equal(np.asarray([1.0, 2.0, 3.0]), cw)

    # When a class weight is specified that isn't in classes, a ValueError
    # should get raised
    msg = 'Class label 4 not present.'
    class_weights = {0: 1.0, 1: 2.0, 2: 3.0, 4: 1.5}
    assert_raise_message(ValueError, msg, compute_class_weight, class_weights,
                         classes, y)
    msg = 'Class label -1 not present.'
    class_weights = {-1: 5.0, 0: 1.0, 1: 2.0, 2: 3.0}
    assert_raise_message(ValueError, msg, compute_class_weight, class_weights,
                         classes, y) 
Example #5
Source File: conv_net.py    From arxiv-twitterbot with MIT License
def fit(self, batch_size, epochs, save_best_model_to_filepath=None):
        checkpoint = ModelCheckpoint(save_best_model_to_filepath,
                                     monitor='val_acc', verbose=1,
                                     save_best_only=True, mode='max')

        weights = class_weight.compute_class_weight('balanced', np.unique(self.train_labels),
                                                    self.train_labels)
        weights[1] = weights[1] * 5
        # Fit the model
        self.model.fit(self.x_train, self.y_train,
                       batch_size=batch_size,
                       epochs=epochs,
                       class_weight=None if not self.balance_classes else weights,
                       callbacks=[checkpoint] if save_best_model_to_filepath is not None else [],
                       validation_data=[self.x_test, self.y_test])
        if save_best_model_to_filepath:
            self.model = load_model(save_best_model_to_filepath)
        return self.model 
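Note that Keras documents the class_weight argument of Model.fit() as a dict mapping class indices to weights, so passing the raw NumPy array returned by compute_class_weight may need a conversion in recent Keras releases. A minimal, hypothetical sketch (train_labels is a made-up stand-in for self.train_labels above):

import numpy as np
from sklearn.utils import class_weight

# Hypothetical integer labels, standing in for self.train_labels above.
train_labels = np.array([0, 0, 0, 1, 1, 0, 0, 1])
classes = np.unique(train_labels)
weights = class_weight.compute_class_weight('balanced', classes=classes, y=train_labels)

# Keras' Model.fit(class_weight=...) expects {class_index: weight}.
class_weight_dict = dict(zip(classes.tolist(), weights.tolist()))
# model.fit(x_train, y_train, class_weight=class_weight_dict, ...)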
Example #6
Source File: cnn_class.py    From AI_in_Medicine_Clinical_Imaging_Classification with MIT License
def split_data(self, y_file_path, X, test_data_size=0.2):
        """
        Split data into test and training data sets.

        INPUT
            y_file_path: path to CSV containing labels
            X: NumPy array of arrays
            test_data_size: size of test/train split. Value from 0 to 1

        OUTPUT
            Four arrays: X_train, X_test, y_train, and y_test
        """
        # labels = pd.read_csv(y_file_path, nrows=60)
        labels = pd.read_csv(y_file_path)
        self.X = np.load(X)
        self.y = np.array(labels['level'])
        self.weights = class_weight.compute_class_weight('balanced', np.unique(self.y), self.y)
        self.test_data_size = test_data_size
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y,
                                                                                test_size=self.test_data_size,
                                                                                random_state=42) 
Example #7
Source File: dataFunctions.py    From baseline with Apache License 2.0
def calculate_class_weights(params):
    """
    Computes the class weights for the training data and writes out to a json file 
    :param params: global parameters, used to find location of the dataset and json file
    :return: 
    """
    
    counts = {}
    for i in range(0,params.num_labels):
        counts[i] = 0

    trainingData = json.load(open(params.files['training_struct']))

    ytrain = []
    for i,currData in enumerate(trainingData):
        ytrain.append(currData['category'])
        counts[currData['category']] += 1
        print(i)

    classWeights = class_weight.compute_class_weight('balanced', np.unique(ytrain), np.array(ytrain))

    with open(params.files['class_weight'], 'w') as json_file:
        json.dump(classWeights.tolist(), json_file) 
Example #8
Source File: zil.py    From incremental_learning.pytorch with MIT License
def get_class_weights_raw(self, targets):
        unique_targets = np.unique(targets)
        class_weights = compute_class_weight('balanced', unique_targets, targets)
        return torch.tensor(class_weights).to(self._device).float()

    # -----------
    # Constraints
    # ----------- 
Example #9
Source File: test_class_weight.py    From twitter-stock-recommendation with MIT License
def test_compute_class_weight_balanced_unordered():
    # Test compute_class_weight when classes are unordered
    classes = np.array([1, 0, 3])
    y = np.asarray([1, 0, 0, 3, 3, 3])

    cw = compute_class_weight("balanced", classes, y)
    class_counts = np.bincount(y)[classes]
    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
    assert_array_almost_equal(cw, [2., 1., 2. / 3]) 
Example #10
Source File: test_class_weight.py    From twitter-stock-recommendation with MIT License
def test_compute_class_weight_not_present():
    # Raise error when y does not contain all class labels
    classes = np.arange(4)
    y = np.asarray([0, 0, 0, 1, 1, 2])
    assert_raises(ValueError, compute_class_weight, "balanced", classes, y)
    # Fix exception in error message formatting when missing label is a string
    # https://github.com/scikit-learn/scikit-learn/issues/8312
    assert_raise_message(ValueError,
                         'Class label label_not_present not present',
                         compute_class_weight,
                         {'label_not_present': 1.}, classes, y)
    # Raise error when y has items not in classes
    classes = np.arange(2)
    assert_raises(ValueError, compute_class_weight, "balanced", classes, y)
    assert_raises(ValueError, compute_class_weight, {0: 1., 1: 2.}, classes, y) 
Example #11
Source File: test_class_weight.py    From twitter-stock-recommendation with MIT License
def test_compute_class_weight():
    # Test (and demo) compute_class_weight.
    y = np.asarray([2, 2, 2, 3, 3, 4])
    classes = np.unique(y)

    cw = compute_class_weight("balanced", classes, y)
    # total effect of samples is preserved
    class_counts = np.bincount(y)[2:]
    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
    assert_true(cw[0] < cw[1] < cw[2]) 
Example #12
Source File: classifier_selection.py    From causallib with Apache License 2.0
def _select_classifier_from_list(candidates, X, A, n_splits=5, seed=None, loss_type='01'):
    accuracies = np.zeros(len(candidates))

    class_weight = compute_class_weight('balanced', np.unique(A), A)[LabelEncoder().fit_transform(A)]

    if n_splits >= 2:
        cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
        for model_idx, m in enumerate(candidates):
            if loss_type == '01':
                pred = cross_val_predict(m, X=X, y=A, cv=cv, fit_params={'sample_weight': class_weight}).reshape(-1)
            else:
                ps = cross_val_predict(m, X=X, y=A, cv=cv, fit_params={'sample_weight': class_weight},
                                       method='predict_proba')
                pred = ps[:, 1]
    else:
        for model_idx, m in enumerate(candidates):
            m.fit(X, A, sample_weight=class_weight)
            if loss_type == '01':
                pred = m.predict(X=X)
            else:
                pred = m.predict_proba(X=X)[:, 1]

    if loss_type == '01':
        accuracies[model_idx] = np.sum(class_weight[pred == A]) / np.sum(class_weight)
    else:
        logl = np.zeros(A.shape)
        logl[A == -1] = np.log(1.0 - pred[A == -1])
        logl[A == 1] = np.log(pred[A == 1])
        accuracies[model_idx] = np.sum(class_weight * logl) / np.sum(class_weight)

    i_best = np.argmax(accuracies)
    # print('accuracies =', accuracies, "accuracies-sorted", sorted(accuracies))
    # print('Selected model {} {}'.format(i_best, candidates[i_best]))
    return candidates[i_best] 
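The per-sample expansion of class weights seen above (indexing the balanced weights by the label-encoded values of A) can also be obtained directly with sklearn.utils.class_weight.compute_sample_weight. A minimal sketch with a made-up A:

import numpy as np
from sklearn.utils.class_weight import compute_sample_weight

# Hypothetical binary treatment assignment, standing in for A above.
A = np.array([0, 0, 0, 0, 1, 1])

# One weight per sample, equal to the 'balanced' weight of that sample's class.
sample_weight = compute_sample_weight(class_weight='balanced', y=A)
print(sample_weight)  # approximately [0.75, 0.75, 0.75, 0.75, 1.5, 1.5]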
Example #13
Source File: generate_class_weights.py    From Pytorch-Project-Template with MIT License
def calculate_weigths_labels():
    class Config:
        mode = "train"
        num_classes = 21
        batch_size = 32
        max_epoch = 150

        validate_every = 2

        checkpoint_file = "checkpoint.pth.tar"

        data_loader = "VOCDataLoader"
        data_root = "../data/pascal_voc_seg/"
        data_loader_workers = 4
        pin_memory = True
        async_loading = True

    # Create an instance from the data loader
    from tqdm import tqdm
    data_loader = VOCDataLoader(Config)
    z = np.zeros((Config.num_classes,))
    # Initialize tqdm
    tqdm_batch = tqdm(data_loader.train_loader, total=data_loader.train_iterations)

    for _, y in tqdm_batch:
        labels = y.numpy().astype(np.uint8).ravel().tolist()
        z += np.bincount(labels, minlength=Config.num_classes)
    tqdm_batch.close()
    # ret = compute_class_weight(class_weight='balanced', classes=np.arange(21), y=np.asarray(labels, dtype=np.uint8))
    total_frequency = np.sum(z)
    print(z)
    print(total_frequency)
    class_weights = []
    for frequency in z:
        class_weight = 1 / (np.log(1.02 + (frequency / total_frequency)))
        class_weights.append(class_weight)
    ret = np.array(class_weights)
    np.save('../pretrained_weights/voc2012_256_class_weights', ret)
    print(ret) 
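As an aside, the 'balanced' heuristic that the commented-out compute_class_weight call would apply can also be reproduced directly from the count histogram z, without rebuilding the full label list. A minimal sketch (the counts are made up, and this assumes no class has a zero count):

import numpy as np

# Hypothetical per-class pixel counts, standing in for z above.
z = np.array([5000.0, 1200.0, 300.0])

# Same formula as compute_class_weight('balanced', ...):
# n_samples / (n_classes * count_per_class)
balanced_weights = z.sum() / (len(z) * z)
print(balanced_weights)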
Example #14
Source File: zil.py    From incremental_learning.pytorch with MIT License
def get_class_weights(self, loader):
        targets = []
        for input_dict in loader:
            targets.append(input_dict["targets"])
        targets = torch.cat(targets).cpu().numpy()
        unique_targets = np.unique(targets)
        class_weights = compute_class_weight('balanced', unique_targets, targets)
        return torch.tensor(class_weights).to(self._device).float() 
Example #15
Source File: losses.py    From pytorch_segmentation with MIT License
def get_weights(target):
    t_np = target.view(-1).data.cpu().numpy()

    classes, counts = np.unique(t_np, return_counts=True)
    cls_w = np.median(counts) / counts
    #cls_w = class_weight.compute_class_weight('balanced', classes, t_np)

    weights = np.ones(7)
    weights[classes] = cls_w
    return torch.from_numpy(weights).float().cuda() 
Example #16
Source File: Keras_utils.py    From coling2018_fake-news-challenge with Apache License 2.0
def calculate_class_weight(y_train, no_classes=2):
    # https://datascience.stackexchange.com/questions/13490/how-to-set-class-weights-for-imbalanced-classes-in-keras
    from sklearn.utils import class_weight

    class_weight_list = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
    class_weights = {}
    for i in range(no_classes):
        class_weights[i] = class_weight_list[i]
    print(class_weights)
    return class_weights 
Example #17
Source File: test_class_weight.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_compute_class_weight_balanced_unordered():
    # Test compute_class_weight when classes are unordered
    classes = np.array([1, 0, 3])
    y = np.asarray([1, 0, 0, 3, 3, 3])

    cw = compute_class_weight("balanced", classes, y)
    class_counts = np.bincount(y)[classes]
    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
    assert_array_almost_equal(cw, [2., 1., 2. / 3]) 
Example #18
Source File: test_class_weight.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_compute_class_weight_balanced_negative():
    # Test compute_class_weight when labels are negative
    # Test with balanced class labels.
    classes = np.array([-2, -1, 0])
    y = np.asarray([-1, -1, 0, 0, -2, -2])

    cw = compute_class_weight("balanced", classes, y)
    assert_equal(len(cw), len(classes))
    assert_array_almost_equal(cw, np.array([1., 1., 1.]))

    # Test with unbalanced class labels.
    y = np.asarray([-1, 0, 0, -2, -2, -2])

    cw = compute_class_weight("balanced", classes, y)
    assert_equal(len(cw), len(classes))
    class_counts = np.bincount(y + 2)
    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
    assert_array_almost_equal(cw, [2. / 3, 2., 1.]) 
Example #19
Source File: test_class_weight.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_compute_class_weight_not_present():
    # Raise error when y does not contain all class labels
    classes = np.arange(4)
    y = np.asarray([0, 0, 0, 1, 1, 2])
    assert_raises(ValueError, compute_class_weight, "balanced", classes, y)
    # Fix exception in error message formatting when missing label is a string
    # https://github.com/scikit-learn/scikit-learn/issues/8312
    assert_raise_message(ValueError,
                         'Class label label_not_present not present',
                         compute_class_weight,
                         {'label_not_present': 1.}, classes, y)
    # Raise error when y has items not in classes
    classes = np.arange(2)
    assert_raises(ValueError, compute_class_weight, "balanced", classes, y)
    assert_raises(ValueError, compute_class_weight, {0: 1., 1: 2.}, classes, y) 
Example #20
Source File: test_class_weight.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_compute_class_weight():
    # Test (and demo) compute_class_weight.
    y = np.asarray([2, 2, 2, 3, 3, 4])
    classes = np.unique(y)

    cw = compute_class_weight("balanced", classes, y)
    # total effect of samples is preserved
    class_counts = np.bincount(y)[2:]
    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
    assert cw[0] < cw[1] < cw[2] 
Example #21
Source File: data_silo.py    From FARM with Apache License 2.0
def calculate_class_weights(self, task_name, source="train"):
        """ For imbalanced datasets, we can calculate class weights that can be used later in the
        loss function of the prediction head to upweight the loss of minorities.

        :param task_name: name of the task as used in the processor
        :type task_name: str
        """
        
        tensor_name = self.processor.tasks[task_name]["label_tensor_name"]
        label_list = self.processor.tasks[task_name]["label_list"]
        tensor_idx = list(self.tensor_names).index(tensor_name)
        # we need at least ONE observation for each label to avoid division by zero in compute_class_weights.
        observed_labels = copy.deepcopy(label_list)
        if source == "all":
            datasets = self.data.values()
        elif source == "train":
            datasets = [self.data["train"]]
        else:
            raise Exception("source argument expects one of [\"train\", \"all\"]")
        for dataset in datasets:
            if "multilabel" in self.processor.tasks[task_name]["task_type"]:
                for x in dataset:
                    observed_labels += [label_list[label_id] for label_id in (x[tensor_idx] == 1).nonzero()]
            else:
                observed_labels += [label_list[x[tensor_idx].item()] for x in dataset]

        #TODO scale e.g. via logarithm to avoid crazy spikes for rare classes
        class_weights = compute_class_weight("balanced", np.asarray(label_list), observed_labels)
        # conversion necessary to have class weights of same type as model weights
        class_weights = class_weights.astype(np.float32)
        return class_weights
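As a rough illustration of where such weights typically end up, here is a minimal sketch of feeding balanced class weights into a PyTorch cross-entropy loss (the labels are made up; this is not FARM's actual prediction-head code):

import numpy as np
import torch
import torch.nn as nn
from sklearn.utils.class_weight import compute_class_weight

# Hypothetical observed labels for a 3-class task.
observed_labels = np.array([0, 0, 0, 0, 1, 1, 2, 2, 2])
classes = np.unique(observed_labels)

# Cast to float32 so the weights match the model's parameter dtype.
weights = compute_class_weight(class_weight="balanced",
                               classes=classes, y=observed_labels).astype(np.float32)

# nn.CrossEntropyLoss accepts a per-class weight tensor, upweighting rare classes.
criterion = nn.CrossEntropyLoss(weight=torch.from_numpy(weights))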