Python sklearn.utils.class_weight.compute_class_weight() Examples

The following are 21 code examples of sklearn.utils.class_weight.compute_class_weight(), drawn from open-source projects; the source file and originating project are noted above each example. You may also want to check out the other functions and classes available in the module sklearn.utils.class_weight.
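Note that the examples on this page use the older positional call style, compute_class_weight('balanced', classes, y). In scikit-learn 1.0 and later, classes and y are keyword-only arguments, so a current minimal usage looks like the following sketch (the toy label array is made up for illustration):

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Toy, imbalanced label vector: class 0 is three times as frequent as class 1.
y = np.array([0, 0, 0, 0, 0, 0, 1, 1])
classes = np.unique(y)

# 'balanced' weights are n_samples / (n_classes * count_per_class),
# so rarer classes receive proportionally larger weights.
weights = compute_class_weight(class_weight="balanced", classes=classes, y=y)
print(weights)  # approximately [0.667, 2.0]; the rarer class gets the larger weight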
Example #1
Source File: cnn_class.py    From eyenet with MIT License
def split_data(self, y_file_path, X, test_data_size=0.2):
        """
        Split data into test and training data sets.

        INPUT
            y_file_path: path to CSV containing labels
            X: NumPy array of arrays
            test_data_size: size of test/train split. Value from 0 to 1

        OUTPUT
            Four arrays: X_train, X_test, y_train, and y_test
        """
        # labels = pd.read_csv(y_file_path, nrows=60)
        labels = pd.read_csv(y_file_path)
        self.X = np.load(X)
        self.y = np.array(labels['level'])
        self.weights = class_weight.compute_class_weight('balanced', np.unique(self.y), self.y)
        self.test_data_size = test_data_size
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y,
                                                                                test_size=self.test_data_size,
                                                                                random_state=42) 
Example #2
Source File: test_class_weight.py    From twitter-stock-recommendation with MIT License
def test_compute_class_weight_balanced_negative():
    # Test compute_class_weight when labels are negative
    # Test with balanced class labels.
    classes = np.array([-2, -1, 0])
    y = np.asarray([-1, -1, 0, 0, -2, -2])

    cw = compute_class_weight("balanced", classes, y)
    assert_equal(len(cw), len(classes))
    assert_array_almost_equal(cw, np.array([1., 1., 1.]))

    # Test with unbalanced class labels.
    y = np.asarray([-1, 0, 0, -2, -2, -2])

    cw = compute_class_weight("balanced", classes, y)
    assert_equal(len(cw), len(classes))
    class_counts = np.bincount(y + 2)
    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
    assert_array_almost_equal(cw, [2. / 3, 2., 1.]) 
Example #3
Source File: test_class_weight.py    From twitter-stock-recommendation with MIT License
def test_compute_class_weight_dict():
    classes = np.arange(3)
    class_weights = {0: 1.0, 1: 2.0, 2: 3.0}
    y = np.asarray([0, 0, 1, 2])
    cw = compute_class_weight(class_weights, classes, y)

    # When the user specifies class weights, compute_class_weights should just
    # return them.
    assert_array_almost_equal(np.asarray([1.0, 2.0, 3.0]), cw)

    # When a class weight is specified that isn't in classes, a ValueError
    # should get raised
    msg = 'Class label 4 not present.'
    class_weights = {0: 1.0, 1: 2.0, 2: 3.0, 4: 1.5}
    assert_raise_message(ValueError, msg, compute_class_weight, class_weights,
                         classes, y)
    msg = 'Class label -1 not present.'
    class_weights = {-1: 5.0, 0: 1.0, 1: 2.0, 2: 3.0}
    assert_raise_message(ValueError, msg, compute_class_weight, class_weights,
                         classes, y) 
Example #4
Source File: test_class_weight.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_compute_class_weight_dict():
    classes = np.arange(3)
    class_weights = {0: 1.0, 1: 2.0, 2: 3.0}
    y = np.asarray([0, 0, 1, 2])
    cw = compute_class_weight(class_weights, classes, y)

    # When the user specifies class weights, compute_class_weights should just
    # return them.
    assert_array_almost_equal(np.asarray([1.0, 2.0, 3.0]), cw)

    # When a class weight is specified that isn't in classes, a ValueError
    # should get raised
    msg = 'Class label 4 not present.'
    class_weights = {0: 1.0, 1: 2.0, 2: 3.0, 4: 1.5}
    assert_raise_message(ValueError, msg, compute_class_weight, class_weights,
                         classes, y)
    msg = 'Class label -1 not present.'
    class_weights = {-1: 5.0, 0: 1.0, 1: 2.0, 2: 3.0}
    assert_raise_message(ValueError, msg, compute_class_weight, class_weights,
                         classes, y) 
Example #5
Source File: conv_net.py    From arxiv-twitterbot with MIT License
def fit(self, batch_size, epochs, save_best_model_to_filepath=None):
        checkpoint = ModelCheckpoint(save_best_model_to_filepath,
                                     monitor='val_acc', verbose=1,
                                     save_best_only=True, mode='max')

        weights = class_weight.compute_class_weight('balanced', np.unique(self.train_labels),
                                                    self.train_labels)
        weights[1] = weights[1] * 5
        # Fit the model
        self.model.fit(self.x_train, self.y_train,
                       batch_size=batch_size,
                       epochs=epochs,
                       class_weight=None if not self.balance_classes else weights,
                       callbacks=[checkpoint] if save_best_model_to_filepath is not None else [],
                       validation_data=[self.x_test, self.y_test])
        if save_best_model_to_filepath:
            self.model = load_model(save_best_model_to_filepath)
        return self.model 
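Note that Keras documents the class_weight argument of Model.fit() as a dict mapping class indices to weights, so passing the raw NumPy array returned by compute_class_weight may need a conversion in recent Keras releases. A minimal, hypothetical sketch (train_labels is a made-up stand-in for self.train_labels above):

import numpy as np
from sklearn.utils import class_weight

# Hypothetical integer labels, standing in for self.train_labels above.
train_labels = np.array([0, 0, 0, 1, 1, 0, 0, 1])
classes = np.unique(train_labels)
weights = class_weight.compute_class_weight('balanced', classes=classes, y=train_labels)

# Keras' Model.fit(class_weight=...) expects {class_index: weight}.
class_weight_dict = dict(zip(classes.tolist(), weights.tolist()))
# model.fit(x_train, y_train, class_weight=class_weight_dict, ...)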
Example #6
Source File: cnn_class.py    From AI_in_Medicine_Clinical_Imaging_Classification with MIT License
def split_data(self, y_file_path, X, test_data_size=0.2):
        """
        Split data into test and training data sets.

        INPUT
            y_file_path: path to CSV containing labels
            X: NumPy array of arrays
            test_data_size: size of test/train split. Value from 0 to 1

        OUTPUT
            Four arrays: X_train, X_test, y_train, and y_test
        """
        # labels = pd.read_csv(y_file_path, nrows=60)
        labels = pd.read_csv(y_file_path)
        self.X = np.load(X)
        self.y = np.array(labels['level'])
        self.weights = class_weight.compute_class_weight('balanced', np.unique(self.y), self.y)
        self.test_data_size = test_data_size
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y,
                                                                                test_size=self.test_data_size,
                                                                                random_state=42) 
Example #7
Source File: dataFunctions.py    From baseline with Apache License 2.0
def calculate_class_weights(params):
    """
    Computes the class weights for the training data and writes out to a json file 
    :param params: global parameters, used to find location of the dataset and json file
    :return: 
    """
    
    counts = {}
    for i in range(0,params.num_labels):
        counts[i] = 0

    trainingData = json.load(open(params.files['training_struct']))

    ytrain = []
    for i,currData in enumerate(trainingData):
        ytrain.append(currData['category'])
        counts[currData['category']] += 1
        print(i)

    classWeights = class_weight.compute_class_weight('balanced', np.unique(ytrain), np.array(ytrain))

    with open(params.files['class_weight'], 'w') as json_file:
        json.dump(classWeights.tolist(), json_file) 
Example #8
Source File: zil.py    From incremental_learning.pytorch with MIT License
def get_class_weights_raw(self, targets):
        unique_targets = np.unique(targets)
        class_weights = compute_class_weight('balanced', unique_targets, targets)
        return torch.tensor(class_weights).to(self._device).float()

    # -----------
    # Constraints
    # ----------- 
Example #9
Source File: test_class_weight.py    From twitter-stock-recommendation with MIT License
def test_compute_class_weight_balanced_unordered():
    # Test compute_class_weight when classes are unordered
    classes = np.array([1, 0, 3])
    y = np.asarray([1, 0, 0, 3, 3, 3])

    cw = compute_class_weight("balanced", classes, y)
    class_counts = np.bincount(y)[classes]
    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
    assert_array_almost_equal(cw, [2., 1., 2. / 3]) 
Example #10
Source File: test_class_weight.py    From twitter-stock-recommendation with MIT License
def test_compute_class_weight_not_present():
    # Raise error when y does not contain all class labels
    classes = np.arange(4)
    y = np.asarray([0, 0, 0, 1, 1, 2])
    assert_raises(ValueError, compute_class_weight, "balanced", classes, y)
    # Fix exception in error message formatting when missing label is a string
    # https://github.com/scikit-learn/scikit-learn/issues/8312
    assert_raise_message(ValueError,
                         'Class label label_not_present not present',
                         compute_class_weight,
                         {'label_not_present': 1.}, classes, y)
    # Raise error when y has items not in classes
    classes = np.arange(2)
    assert_raises(ValueError, compute_class_weight, "balanced", classes, y)
    assert_raises(ValueError, compute_class_weight, {0: 1., 1: 2.}, classes, y) 
Example #11
Source File: test_class_weight.py    From twitter-stock-recommendation with MIT License
def test_compute_class_weight():
    # Test (and demo) compute_class_weight.
    y = np.asarray([2, 2, 2, 3, 3, 4])
    classes = np.unique(y)

    cw = compute_class_weight("balanced", classes, y)
    # total effect of samples is preserved
    class_counts = np.bincount(y)[2:]
    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
    assert_true(cw[0] < cw[1] < cw[2]) 
Example #12
Source File: classifier_selection.py    From causallib with Apache License 2.0
def _select_classifier_from_list(candidates, X, A, n_splits=5, seed=None, loss_type='01'):
    accuracies = np.zeros(len(candidates))

    class_weight = compute_class_weight('balanced', np.unique(A), A)[LabelEncoder().fit_transform(A)]

    if n_splits >= 2:
        cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
        for model_idx, m in enumerate(candidates):
            if loss_type == '01':
                pred = cross_val_predict(m, X=X, y=A, cv=cv, fit_params={'sample_weight': class_weight}).reshape(-1)
            else:
                ps = cross_val_predict(m, X=X, y=A, cv=cv, fit_params={'sample_weight': class_weight},
                                       method='predict_proba')
                pred = ps[:, 1]
    else:
        for model_idx, m in enumerate(candidates):
            m.fit(X, A, sample_weight=class_weight)
            if loss_type == '01':
                pred = m.predict(X=X)
            else:
                pred = m.predict_proba(X=X)[:, 1]

    if loss_type == '01':
        accuracies[model_idx] = np.sum(class_weight[pred == A]) / np.sum(class_weight)
    else:
        logl = np.zeros(A.shape)
        logl[A == -1] = np.log(1.0 - pred[A == -1])
        logl[A == 1] = np.log(pred[A == 1])
        accuracies[model_idx] = np.sum(class_weight * logl) / np.sum(class_weight)

    i_best = np.argmax(accuracies)
    # print('accuracies =', accuracies, "accuracies-sorted", sorted(accuracies))
    # print('Selected model {} {}'.format(i_best, candidates[i_best]))
    return candidates[i_best] 
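The per-sample expansion of class weights seen above (indexing the balanced weights by the label-encoded values of A) can also be obtained directly with sklearn.utils.class_weight.compute_sample_weight. A minimal sketch with a made-up A:

import numpy as np
from sklearn.utils.class_weight import compute_sample_weight

# Hypothetical binary treatment assignment, standing in for A above.
A = np.array([0, 0, 0, 0, 1, 1])

# One weight per sample, equal to the 'balanced' weight of that sample's class.
sample_weight = compute_sample_weight(class_weight='balanced', y=A)
print(sample_weight)  # approximately [0.75, 0.75, 0.75, 0.75, 1.5, 1.5]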
Example #13
Source File: generate_class_weights.py    From Pytorch-Project-Template with MIT License
def calculate_weigths_labels():
    class Config:
        mode = "train"
        num_classes = 21
        batch_size = 32
        max_epoch = 150

        validate_every = 2

        checkpoint_file = "checkpoint.pth.tar"

        data_loader = "VOCDataLoader"
        data_root = "../data/pascal_voc_seg/"
        data_loader_workers = 4
        pin_memory = True
        async_loading = True

    # Create an instance from the data loader
    from tqdm import tqdm
    data_loader = VOCDataLoader(Config)
    z = np.zeros((Config.num_classes,))
    # Initialize tqdm
    tqdm_batch = tqdm(data_loader.train_loader, total=data_loader.train_iterations)

    for _, y in tqdm_batch:
        labels = y.numpy().astype(np.uint8).ravel().tolist()
        z += np.bincount(labels, minlength=Config.num_classes)
    tqdm_batch.close()
    # ret = compute_class_weight(class_weight='balanced', classes=np.arange(21), y=np.asarray(labels, dtype=np.uint8))
    total_frequency = np.sum(z)
    print(z)
    print(total_frequency)
    class_weights = []
    for frequency in z:
        class_weight = 1 / (np.log(1.02 + (frequency / total_frequency)))
        class_weights.append(class_weight)
    ret = np.array(class_weights)
    np.save('../pretrained_weights/voc2012_256_class_weights', ret)
    print(ret) 
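As an aside, the 'balanced' heuristic that the commented-out compute_class_weight call would apply can also be reproduced directly from the count histogram z, without rebuilding the full label list. A minimal sketch (the counts are made up, and this assumes no class has a zero count):

import numpy as np

# Hypothetical per-class pixel counts, standing in for z above.
z = np.array([5000.0, 1200.0, 300.0])

# Same formula as compute_class_weight('balanced', ...):
# n_samples / (n_classes * count_per_class)
balanced_weights = z.sum() / (len(z) * z)
print(balanced_weights)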
Example #14
Source File: zil.py    From incremental_learning.pytorch with MIT License
def get_class_weights(self, loader):
        targets = []
        for input_dict in loader:
            targets.append(input_dict["targets"])
        targets = torch.cat(targets).cpu().numpy()
        unique_targets = np.unique(targets)
        class_weights = compute_class_weight('balanced', unique_targets, targets)
        return torch.tensor(class_weights).to(self._device).float() 
Example #15
Source File: losses.py    From pytorch_segmentation with MIT License
def get_weights(target):
    t_np = target.view(-1).data.cpu().numpy()

    classes, counts = np.unique(t_np, return_counts=True)
    cls_w = np.median(counts) / counts
    #cls_w = class_weight.compute_class_weight('balanced', classes, t_np)

    weights = np.ones(7)
    weights[classes] = cls_w
    return torch.from_numpy(weights).float().cuda() 
Example #16
Source File: Keras_utils.py    From coling2018_fake-news-challenge with Apache License 2.0
def calculate_class_weight(y_train, no_classes=2):
    # https://datascience.stackexchange.com/questions/13490/how-to-set-class-weights-for-imbalanced-classes-in-keras
    from sklearn.utils import class_weight

    class_weight_list = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
    class_weights = {}
    for i in range(no_classes):
        class_weights[i] = class_weight_list[i]
    print(class_weights)
    return class_weights 
Example #17
Source File: test_class_weight.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_compute_class_weight_balanced_unordered():
    # Test compute_class_weight when classes are unordered
    classes = np.array([1, 0, 3])
    y = np.asarray([1, 0, 0, 3, 3, 3])

    cw = compute_class_weight("balanced", classes, y)
    class_counts = np.bincount(y)[classes]
    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
    assert_array_almost_equal(cw, [2., 1., 2. / 3]) 
Example #18
Source File: test_class_weight.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_compute_class_weight_balanced_negative():
    # Test compute_class_weight when labels are negative
    # Test with balanced class labels.
    classes = np.array([-2, -1, 0])
    y = np.asarray([-1, -1, 0, 0, -2, -2])

    cw = compute_class_weight("balanced", classes, y)
    assert_equal(len(cw), len(classes))
    assert_array_almost_equal(cw, np.array([1., 1., 1.]))

    # Test with unbalanced class labels.
    y = np.asarray([-1, 0, 0, -2, -2, -2])

    cw = compute_class_weight("balanced", classes, y)
    assert_equal(len(cw), len(classes))
    class_counts = np.bincount(y + 2)
    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
    assert_array_almost_equal(cw, [2. / 3, 2., 1.]) 
Example #19
Source File: test_class_weight.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_compute_class_weight_not_present():
    # Raise error when y does not contain all class labels
    classes = np.arange(4)
    y = np.asarray([0, 0, 0, 1, 1, 2])
    assert_raises(ValueError, compute_class_weight, "balanced", classes, y)
    # Fix exception in error message formatting when missing label is a string
    # https://github.com/scikit-learn/scikit-learn/issues/8312
    assert_raise_message(ValueError,
                         'Class label label_not_present not present',
                         compute_class_weight,
                         {'label_not_present': 1.}, classes, y)
    # Raise error when y has items not in classes
    classes = np.arange(2)
    assert_raises(ValueError, compute_class_weight, "balanced", classes, y)
    assert_raises(ValueError, compute_class_weight, {0: 1., 1: 2.}, classes, y) 
Example #20
Source File: test_class_weight.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_compute_class_weight():
    # Test (and demo) compute_class_weight.
    y = np.asarray([2, 2, 2, 3, 3, 4])
    classes = np.unique(y)

    cw = compute_class_weight("balanced", classes, y)
    # total effect of samples is preserved
    class_counts = np.bincount(y)[2:]
    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
    assert cw[0] < cw[1] < cw[2] 
Example #21
Source File: data_silo.py    From FARM with Apache License 2.0
def calculate_class_weights(self, task_name, source="train"):
        """ For imbalanced datasets, we can calculate class weights that can be used later in the
        loss function of the prediction head to upweight the loss of minorities.

        :param task_name: name of the task as used in the processor
        :type task_name: str
        """
        
        tensor_name = self.processor.tasks[task_name]["label_tensor_name"]
        label_list = self.processor.tasks[task_name]["label_list"]
        tensor_idx = list(self.tensor_names).index(tensor_name)
        # we need at least ONE observation for each label to avoid division by zero in compute_class_weights.
        observed_labels = copy.deepcopy(label_list)
        if source == "all":
            datasets = self.data.values()
        elif source == "train":
            datasets = [self.data["train"]]
        else:
            raise Exception("source argument expects one of [\"train\", \"all\"]")
        for dataset in datasets:
            if "multilabel" in self.processor.tasks[task_name]["task_type"]:
                for x in dataset:
                    observed_labels += [label_list[label_id] for label_id in (x[tensor_idx] == 1).nonzero()]
            else:
                observed_labels += [label_list[x[tensor_idx].item()] for x in dataset]

        #TODO scale e.g. via logarithm to avoid crazy spikes for rare classes
        class_weights = compute_class_weight("balanced", np.asarray(label_list), observed_labels)
        # conversion necessary to have class weights of same type as model weights
        class_weights = class_weights.astype(np.float32)
        return class_weights
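As a rough illustration of where such weights typically end up, here is a minimal sketch of feeding balanced class weights into a PyTorch cross-entropy loss (the labels are made up; this is not FARM's actual prediction-head code):

import numpy as np
import torch
import torch.nn as nn
from sklearn.utils.class_weight import compute_class_weight

# Hypothetical observed labels for a 3-class task.
observed_labels = np.array([0, 0, 0, 0, 1, 1, 2, 2, 2])
classes = np.unique(observed_labels)

# Cast to float32 so the weights match the model's parameter dtype.
weights = compute_class_weight(class_weight="balanced",
                               classes=classes, y=observed_labels).astype(np.float32)

# nn.CrossEntropyLoss accepts a per-class weight tensor, upweighting rare classes.
criterion = nn.CrossEntropyLoss(weight=torch.from_numpy(weights))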