Python sklearn.metrics.cohen_kappa_score() Examples
The following are 22 code examples of sklearn.metrics.cohen_kappa_score(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.metrics, or try the search function.
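Before the project examples, here is a minimal, self-contained sketch of how the function is typically called. The label vectors below are illustrative toy data, not taken from any of the projects:

from sklearn.metrics import cohen_kappa_score

# Two annotators label the same five items.
rater_a = [0, 1, 1, 0, 2]
rater_b = [0, 1, 0, 0, 1]

# Unweighted kappa: chance-corrected agreement, ranging from -1 to 1.
print(cohen_kappa_score(rater_a, rater_b))

# For ordinal labels (e.g. severity grades), 'linear' or 'quadratic'
# weights penalize larger disagreements more heavily.
print(cohen_kappa_score(rater_a, rater_b, weights='quadratic'))

Note that the score is symmetric in its two arguments (as asserted in the sklearn test examples below), which is why several projects pass predictions first and ground truth second.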
Example #1
Source File: cnn.py From Price_Prediction_LOB with MIT License | 6 votes |
def evaluate(source, source_batch):
    # Turn on evaluation mode, which disables dropout.
    model.eval()
    total_loss = 0
    y_true = []  # true labels
    y_pred = []  # predicted labels
    for i in range(len(source_batch)):
        data, targets = get_batch(source, source_batch, i)
        outputs = model(data)
        total_loss += len(targets) * criterion(outputs, targets).data
        _, predicted = torch.max(outputs, 1)
        y_true.extend(targets.tolist())
        y_pred.extend(predicted.tolist())
    val_loss = total_loss.item() / np.size(source_batch)
    # Make report for the classifier
    report = classification_report(y_true, y_pred, target_names=classes)
    kappa = cohen_kappa_score(y_true, y_pred)
    return val_loss, kappa, report

# Loop over epochs
Example #2
Source File: rnn.py From Price_Prediction_LOB with MIT License | 6 votes |
def evaluate(source, source_batch):
    # Turn on evaluation mode, which disables dropout.
    model.eval()
    total_loss = 0
    y_true = []  # true labels
    y_pred = []  # predicted labels
    hidden = model.init_hidden(args.bsz)
    for i in range(len(source_batch)):
        data, targets = get_batch(source, source_batch, i)
        output, hidden = model(data, hidden)
        total_loss += len(targets) * criterion(output[-1], targets).data
        _, predicted = torch.max(output[-1], 1)
        y_true.extend(targets.tolist())
        y_pred.extend(predicted.tolist())
        hidden = repackage_hidden(hidden)
    val_loss = total_loss.item() / np.size(source_batch)
    # Make report for the classifier
    report = classification_report(y_true, y_pred, target_names=classes)
    kappa = cohen_kappa_score(y_true, y_pred)
    return val_loss, kappa, report

# Loop over epochs
Example #3
Source File: TransferLearning_reg.py From Intelligent-Projects-Using-Python with MIT License | 6 votes |
def inference_validation(self, test_X, test_y, model_save_dest, n_class=5, folds=5):
    print(test_X.shape, test_y.shape)
    pred = np.zeros(test_X.shape[0])
    for k in range(1, folds + 1):
        print(f'running inference on fold: {k}')
        model = keras.models.load_model(model_save_dest[k])
        pred = pred + model.predict(test_X)[:, 0]
    print(pred.shape)
    print(pred)
    pred = pred / float(folds)
    pred_class = np.round(pred)
    pred_class = np.array(pred_class, dtype=int)
    # Clip predicted grades to the valid range [0, 4].
    pred_class = list(map(lambda x: 4 if x > 4 else x, pred_class))
    pred_class = list(map(lambda x: 0 if x < 0 else x, pred_class))
    act_class = test_y
    accuracy = np.sum([pred_class == act_class]) * 1.0 / len(test_X)
    kappa = cohen_kappa_score(pred_class, act_class, weights='quadratic')
    return pred_class, accuracy, kappa
Example #4
Source File: cnn_class.py From eyenet with MIT License | 6 votes |
def predict(self):
    """
    Predicts the model output, and computes precision, recall, and F1 score.

    INPUT
        model: Model trained in Keras

    OUTPUT
        Precision, Recall, and F1 score
    """
    predictions = self.model.predict(self.X_test)
    predictions = np.argmax(predictions, axis=1)
    # predictions[predictions >= 1] = 1  # Remove when non binary classifier
    self.y_test = np.argmax(self.y_test, axis=1)
    precision = precision_score(self.y_test, predictions, average="micro")
    recall = recall_score(self.y_test, predictions, average="micro")
    f1 = f1_score(self.y_test, predictions, average="micro")
    cohen_kappa = cohen_kappa_score(self.y_test, predictions)
    quad_kappa = kappa(self.y_test, predictions, weights='quadratic')
    return precision, recall, f1, cohen_kappa, quad_kappa
Example #5
Source File: cnn_class.py From AI_in_Medicine_Clinical_Imaging_Classification with MIT License | 6 votes |
def predict(self):
    """
    Predicts the model output, and computes precision, recall, and F1 score.

    INPUT
        model: Model trained in Keras

    OUTPUT
        Precision, Recall, and F1 score
    """
    predictions = self.model.predict(self.X_test)
    predictions = np.argmax(predictions, axis=1)
    # predictions[predictions >= 1] = 1  # Remove when non binary classifier
    self.y_test = np.argmax(self.y_test, axis=1)
    precision = precision_score(self.y_test, predictions, average="micro")
    recall = recall_score(self.y_test, predictions, average="micro")
    f1 = f1_score(self.y_test, predictions, average="micro")
    cohen_kappa = cohen_kappa_score(self.y_test, predictions)
    quad_kappa = kappa(self.y_test, predictions, weights='quadratic')
    return precision, recall, f1, cohen_kappa, quad_kappa
Example #6
Source File: metrics.py From MultiPlanarUNet with MIT License | 6 votes |
def class_wise_kappa(true, pred, n_classes=None, ignore_zero=True):
    from sklearn.metrics import cohen_kappa_score
    if n_classes is None:
        classes = np.unique(true)
    else:
        classes = np.arange(max(2, n_classes))
    # Ignore background class?
    if ignore_zero:
        classes = classes[np.where(classes != 0)]
    # Calculate kappa for all targets
    kappa_scores = np.empty(shape=classes.shape, dtype=np.float32)
    kappa_scores.fill(np.nan)
    for idx, _class in enumerate(classes):
        s1 = true == _class
        s2 = pred == _class
        if np.any(s1) or np.any(s2):
            kappa_scores[idx] = cohen_kappa_score(s1, s2)
    return kappa_scores
Example #7
Source File: labeled_reviews_comparator.py From yelp with GNU Lesser General Public License v2.1 | 6 votes |
def toy_cohens_kappa():
    # rater1 = [1, 1, 1, 0]
    # rater2 = [1, 1, 0, 0]
    # rater3 = [0, 1, 1]
    rater1 = ['s', 's', 's', 'g', 'u']
    rater2 = ['s', 's', 'g', 'g', 's']

    taskdata = ([[0, str(i), str(rater1[i])] for i in range(0, len(rater1))] +
                [[1, str(i), str(rater2[i])] for i in range(0, len(rater2))])
    # + [[2, str(i), str(rater3[i])] for i in range(0, len(rater3))]
    print(taskdata)
    ratingtask = agreement.AnnotationTask(data=taskdata)
    print("kappa " + str(ratingtask.kappa()))
    print("fleiss " + str(ratingtask.multi_kappa()))
    print("alpha " + str(ratingtask.alpha()))
    print("scotts " + str(ratingtask.pi()))
    print("sklearn kappa " + str(cohen_kappa_score(rater1, rater2)))
Example #8
Source File: metrics.py From minetorch with MIT License | 5 votes |
def _kappa_score(self):
    png_file = self.scalars(
        {'kappa_score': cohen_kappa_score(self.targets, self.predicts, weights='quadratic')},
        'kappa_score'
    )
    if png_file:
        self.update_sheet('kappa_score', {'raw': png_file, 'processor': 'upload_image'})
Example #9
Source File: test_classification.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_cohen_kappa():
    # These label vectors reproduce the contingency matrix from Artstein and
    # Poesio (2008), Table 1: np.array([[20, 20], [10, 50]]).
    y1 = np.array([0] * 40 + [1] * 60)
    y2 = np.array([0] * 20 + [1] * 20 + [0] * 10 + [1] * 50)
    kappa = cohen_kappa_score(y1, y2)
    assert_almost_equal(kappa, .348, decimal=3)
    assert_equal(kappa, cohen_kappa_score(y2, y1))

    # Add spurious labels and ignore them.
    y1 = np.append(y1, [2] * 4)
    y2 = np.append(y2, [2] * 4)
    assert_equal(cohen_kappa_score(y1, y2, labels=[0, 1]), kappa)

    assert_almost_equal(cohen_kappa_score(y1, y1), 1.)

    # Multiclass example: Artstein and Poesio, Table 4.
    y1 = np.array([0] * 46 + [1] * 44 + [2] * 10)
    y2 = np.array([0] * 52 + [1] * 32 + [2] * 16)
    assert_almost_equal(cohen_kappa_score(y1, y2), .8013, decimal=4)

    # Weighting example: none, linear, quadratic.
    y1 = np.array([0] * 46 + [1] * 44 + [2] * 10)
    y2 = np.array([0] * 50 + [1] * 40 + [2] * 10)
    assert_almost_equal(cohen_kappa_score(y1, y2), .9315, decimal=4)
    assert_almost_equal(cohen_kappa_score(y1, y2, weights="linear"), .9412, decimal=4)
    assert_almost_equal(cohen_kappa_score(y1, y2, weights="quadratic"), .9541, decimal=4)
Example #10
Source File: cnn.py From Price_Prediction_LOB with MIT License | 5 votes |
def train():
    # Turn on training mode, which enables dropout.
    model.train()
    total_loss = 0
    y_true = []  # true labels
    y_pred = []  # predicted labels
    start_time = time.time()
    for batch, i in enumerate(range(len(train_batch))):
        data, targets = get_batch(train_data, train_batch, i)
        model.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.data
        _, predicted = torch.max(outputs, 1)
        y_true.extend(targets.tolist())
        y_pred.extend(predicted.tolist())
        if (batch + 1) % args.log_interval == 0:
            cur_loss = total_loss.item() / (batch + 1)
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.6f} | ms/batch {:5.2f} | '
                  'loss {:5.2f}'.format(
                      epoch, batch + 1, len(train_batch), lr,
                      elapsed * 1000 / args.log_interval, cur_loss))
            start_time = time.time()
    # compute Cohen's Kappa
    kappa = cohen_kappa_score(y_true, y_pred)
    return total_loss.item() / (batch + 1), kappa
Example #11
Source File: rnn.py From Price_Prediction_LOB with MIT License | 5 votes |
def train():
    # Turn on training mode, which enables dropout.
    model.train()
    total_loss = 0
    y_true = []  # true labels
    y_pred = []  # predicted labels
    start_time = time.time()
    hidden = model.init_hidden(args.bsz)
    for batch, i in enumerate(range(len(train_batch))):
        data, targets = get_batch(train_data, train_batch, i)
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output[-1], targets)
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        total_loss += loss.data
        _, predicted = torch.max(output[-1], 1)
        y_true.extend(targets.tolist())
        y_pred.extend(predicted.tolist())
        if (batch + 1) % args.log_interval == 0:
            cur_loss = total_loss.item() / (batch + 1)
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.6f} | ms/batch {:5.2f} | '
                  'loss {:5.2f}'.format(
                      epoch, batch + 1, len(train_batch), lr,
                      elapsed * 1000 / args.log_interval, cur_loss))
            start_time = time.time()
    # compute Cohen's Kappa
    kappa = cohen_kappa_score(y_true, y_pred)
    return total_loss.item() / (batch + 1), kappa
Example #12
Source File: labeled_reviews_comparator.py From yelp with GNU Lesser General Public License v2.1 | 5 votes |
def cohens_kappa():
    data_folder = '/Users/fpena/UCC/Thesis/datasets/context/manuallyLabeledReviews/'
    business_type = Constants.ITEM_TYPE
    file_name = data_folder + '%s_%s_reviews.json'
    labelers = [
        # 'francisco',
        'diego',
        'mesut',
        'rohit',
    ]
    all_records = [
        load_data(file_name % (labeler, business_type)) for labeler in labelers
    ]

    rater1 = [record['review_type'] for record in all_records[0]]
    rater2 = [record['review_type'] for record in all_records[1]]
    rater3 = [record['review_type'] for record in all_records[2]]

    taskdata = ([[0, str(i), str(rater1[i])] for i in range(0, len(rater1))] +
                [[1, str(i), str(rater2[i])] for i in range(0, len(rater2))] +
                [[2, str(i), str(rater3[i])] for i in range(0, len(rater3))])
    print(taskdata)
    ratingtask = agreement.AnnotationTask(data=taskdata)
    print("Observed agreement " + str(ratingtask.avg_Ao()))
    print("kappa " + str(ratingtask.kappa()))
    print("fleiss " + str(ratingtask.multi_kappa()))
    print("alpha " + str(ratingtask.alpha()))
    print("scotts " + str(ratingtask.pi()))
    print("sklearn kappa " + str(cohen_kappa_score(rater1, rater2)))
    print("sklearn kappa " + str(cohen_kappa_score(rater1, rater3)))
    print("sklearn kappa " + str(cohen_kappa_score(rater2, rater3)))
Example #13
Source File: metrics.py From mimic3-benchmarks with MIT License | 5 votes |
def print_metrics_regression(y_true, predictions, verbose=1):
    predictions = np.array(predictions)
    predictions = np.maximum(predictions, 0).flatten()
    y_true = np.array(y_true)
    y_true_bins = [get_bin_custom(x, CustomBins.nbins) for x in y_true]
    prediction_bins = [get_bin_custom(x, CustomBins.nbins) for x in predictions]
    cf = metrics.confusion_matrix(y_true_bins, prediction_bins)
    if verbose:
        print("Custom bins confusion matrix:")
        print(cf)

    kappa = metrics.cohen_kappa_score(y_true_bins, prediction_bins, weights='linear')
    mad = metrics.mean_absolute_error(y_true, predictions)
    mse = metrics.mean_squared_error(y_true, predictions)
    mape = mean_absolute_percentage_error(y_true, predictions)

    if verbose:
        print("Mean absolute deviation (MAD) = {}".format(mad))
        print("Mean squared error (MSE) = {}".format(mse))
        print("Mean absolute percentage error (MAPE) = {}".format(mape))
        print("Cohen kappa score = {}".format(kappa))

    return {"mad": mad, "mse": mse, "mape": mape, "kappa": kappa}
Example #14
Source File: TrainOneClassifier.py From kaggle-rsna18 with MIT License | 5 votes |
def calculate_metrics(val_results_dict, y_pred, y_val, suffix=""):
    tmp_kappa_list = []
    tmp_accur_list = []
    tmp_f1_list = []
    tmp_cm_list = []
    y_val = utils.to_categorical(y_val)[:, -1]
    # Sweep binarization thresholds and keep the best score for each metric.
    for each_threshold in np.linspace(0.1, 0.9, 17):
        tmp_pred = [1 if _ >= each_threshold else 0 for _ in y_pred]
        tmp_kappa_list.append(cohen_kappa_score(tmp_pred, y_val))
        tmp_accur_list.append(accuracy_score(tmp_pred, y_val))
        tmp_f1_list.append(f1_score(tmp_pred, y_val))
        tmp_cm_list.append(competitionMetric(tmp_pred, y_val))
    auroc = round(roc_auc_score(y_val, y_pred), 3)
    kappa = round(np.max(tmp_kappa_list), 3)
    accur = round(np.max(tmp_accur_list), 3)
    cm = round(np.max(tmp_cm_list), 3)
    f1 = round(np.max(tmp_f1_list), 3)
    val_results_dict["auc{}".format(suffix)].append(auroc)
    val_results_dict["kap{}".format(suffix)].append(kappa)
    val_results_dict["acc{}".format(suffix)].append(accur)
    val_results_dict["f1{}".format(suffix)].append(f1)
    val_results_dict["cm{}".format(suffix)].append(cm)
    kappa_threshold = np.linspace(0.1, 0.9, 17)[tmp_kappa_list.index(np.max(tmp_kappa_list))]
    accur_threshold = np.linspace(0.1, 0.9, 17)[tmp_accur_list.index(np.max(tmp_accur_list))]
    f1_threshold = np.linspace(0.1, 0.9, 17)[tmp_f1_list.index(np.max(tmp_f1_list))]
    cm_threshold = np.linspace(0.1, 0.9, 17)[tmp_cm_list.index(np.max(tmp_cm_list))]
    val_results_dict["threshold_kap{}".format(suffix)].append(round(kappa_threshold, 2))
    val_results_dict["threshold_acc{}".format(suffix)].append(round(accur_threshold, 2))
    val_results_dict["threshold_f1{}".format(suffix)].append(round(f1_threshold, 2))
    val_results_dict["threshold_cm{}".format(suffix)].append(round(cm_threshold, 2))
    return val_results_dict
Example #15
Source File: TrainClassifierEnsemble.py From kaggle-rsna18 with MIT License | 5 votes |
def calculate_metrics(val_results_dict, y_pred, y_val, suffix=""):
    tmp_kappa_list = []
    tmp_accur_list = []
    tmp_f1_list = []
    tmp_cm_list = []
    y_val = utils.to_categorical(y_val)[:, -1]
    # Sweep binarization thresholds and keep the best score for each metric.
    for each_threshold in np.linspace(0.1, 0.9, 17):
        tmp_pred = [1 if _ >= each_threshold else 0 for _ in y_pred]
        tmp_kappa_list.append(cohen_kappa_score(tmp_pred, y_val))
        tmp_accur_list.append(accuracy_score(tmp_pred, y_val))
        tmp_f1_list.append(f1_score(tmp_pred, y_val))
        tmp_cm_list.append(competitionMetric(tmp_pred, y_val))
    auroc = round(roc_auc_score(y_val, y_pred), 3)
    kappa = round(np.max(tmp_kappa_list), 3)
    accur = round(np.max(tmp_accur_list), 3)
    cm = round(np.max(tmp_cm_list), 3)
    f1 = round(np.max(tmp_f1_list), 3)
    val_results_dict["auc{}".format(suffix)].append(auroc)
    val_results_dict["kap{}".format(suffix)].append(kappa)
    val_results_dict["acc{}".format(suffix)].append(accur)
    val_results_dict["f1{}".format(suffix)].append(f1)
    val_results_dict["cm{}".format(suffix)].append(cm)
    kappa_threshold = np.linspace(0.1, 0.9, 17)[tmp_kappa_list.index(np.max(tmp_kappa_list))]
    accur_threshold = np.linspace(0.1, 0.9, 17)[tmp_accur_list.index(np.max(tmp_accur_list))]
    f1_threshold = np.linspace(0.1, 0.9, 17)[tmp_f1_list.index(np.max(tmp_f1_list))]
    cm_threshold = np.linspace(0.1, 0.9, 17)[tmp_cm_list.index(np.max(tmp_cm_list))]
    val_results_dict["threshold_kap{}".format(suffix)].append(round(kappa_threshold, 2))
    val_results_dict["threshold_acc{}".format(suffix)].append(round(accur_threshold, 2))
    val_results_dict["threshold_f1{}".format(suffix)].append(round(f1_threshold, 2))
    val_results_dict["threshold_cm{}".format(suffix)].append(round(cm_threshold, 2))
    return val_results_dict
Example #16
Source File: mymetrics.py From hyperspectral_deeplearning_review with GNU General Public License v3.0 | 5 votes |
def reports(y_pred, y_test):
    classification = classification_report(y_test, y_pred)
    oa = accuracy_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    each_acc, aa = AA_andEachClassAccuracy(confusion)
    kappa = cohen_kappa_score(y_test, y_pred)
    return classification, confusion, np.array([oa, aa, kappa] + list(each_acc)) * 100
Example #17
Source File: TransferLearning_ffd.py From Intelligent-Projects-Using-Python with MIT License | 5 votes |
def main(self):
    start_time = time.time()
    print('Data Processing..')
    self.num_class = len(self.class_folders)
    model_to_store_path, class_dict = self.train_model(
        self.train_dir, self.val_dir, n_fold=self.folds, batch_size=self.batch_size,
        epochs=self.epochs, dim=self.dim, lr=self.lr, model=self.model)
    print("Model saved to dest:", model_to_store_path)

    # Validation: evaluate results
    folder_path = Path(f'{self.val_dir}')
    val_results_df = self.inference(model_to_store_path, folder_path, class_dict, self.dim)
    val_results_path = f'{self.outdir}/val_results.csv'
    val_results_df.to_csv(val_results_path, index=False)
    print(f'Validation results saved at : {val_results_path}')
    pred_class_index = np.array(val_results_df['pred_class_index'].values)
    actual_class_index = np.array(val_results_df['actual_class_index'].values)
    print(pred_class_index)
    print(actual_class_index)
    accuracy = np.mean(actual_class_index == pred_class_index)
    kappa = cohen_kappa_score(pred_class_index, actual_class_index, weights='quadratic')
    print(f'Validation Accuracy: {accuracy}')
    print(f'Validation Quadratic Kappa Score: {kappa}')
    # print("Processing Time", time.time() - start_time, ' secs')
Example #18
Source File: TransferLearning.py From Intelligent-Projects-Using-Python with MIT License | 5 votes |
def inference_validation(self, test_X, test_y, model_save_dest, n_class=5, folds=5):
    pred = np.zeros((len(test_X), n_class))
    for k in range(1, folds + 1):
        model = keras.models.load_model(model_save_dest[k])
        pred = pred + model.predict(test_X)
    pred = pred / (1.0 * folds)
    pred_class = np.argmax(pred, axis=1)
    act_class = np.argmax(test_y, axis=1)
    accuracy = np.sum([pred_class == act_class]) * 1.0 / len(test_X)
    kappa = cohen_kappa_score(pred_class, act_class, weights='quadratic')
    return pred_class, accuracy, kappa
Example #19
Source File: test_classification.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_cohen_kappa():
    # These label vectors reproduce the contingency matrix from Artstein and
    # Poesio (2008), Table 1: np.array([[20, 20], [10, 50]]).
    y1 = np.array([0] * 40 + [1] * 60)
    y2 = np.array([0] * 20 + [1] * 20 + [0] * 10 + [1] * 50)
    kappa = cohen_kappa_score(y1, y2)
    assert_almost_equal(kappa, .348, decimal=3)
    assert_equal(kappa, cohen_kappa_score(y2, y1))

    # Add spurious labels and ignore them.
    y1 = np.append(y1, [2] * 4)
    y2 = np.append(y2, [2] * 4)
    assert_equal(cohen_kappa_score(y1, y2, labels=[0, 1]), kappa)

    assert_almost_equal(cohen_kappa_score(y1, y1), 1.)

    # Multiclass example: Artstein and Poesio, Table 4.
    y1 = np.array([0] * 46 + [1] * 44 + [2] * 10)
    y2 = np.array([0] * 52 + [1] * 32 + [2] * 16)
    assert_almost_equal(cohen_kappa_score(y1, y2), .8013, decimal=4)

    # Weighting example: none, linear, quadratic.
    y1 = np.array([0] * 46 + [1] * 44 + [2] * 10)
    y2 = np.array([0] * 50 + [1] * 40 + [2] * 10)
    assert_almost_equal(cohen_kappa_score(y1, y2), .9315, decimal=4)
    assert_almost_equal(cohen_kappa_score(y1, y2, weights="linear"), 0.9412, decimal=4)
    assert_almost_equal(cohen_kappa_score(y1, y2, weights="quadratic"), 0.9541, decimal=4)
Example #20
Source File: metrics_utils.py From ludwig with Apache License 2.0 | 5 votes |
def kappa_score(self):
    return metrics.cohen_kappa_score(self.conditions, self.predictions)
Example #21
Source File: metrics.py From kaggle-aptos2019-blindness-detection with MIT License | 5 votes |
def quadratic_weighted_kappa(y_pred, y_true):
    if torch.is_tensor(y_pred):
        y_pred = y_pred.data.cpu().numpy()
    if torch.is_tensor(y_true):
        y_true = y_true.data.cpu().numpy()
    if y_pred.shape[1] == 1:
        y_pred = y_pred[:, 0]
    else:
        y_pred = np.argmax(y_pred, axis=1)
    return metrics.cohen_kappa_score(y_pred, y_true, weights='quadratic')
Example #22
Source File: test_unet.py From eye-in-the-sky with Apache License 2.0 | 4 votes |
def conf_matrix(Y_gt, Y_pred, num_classes=9):
    total_pixels = 0
    kappa_sum = 0
    sudo_confusion_matrix = np.zeros((num_classes, num_classes))

    # if len(Y_pred.shape) == 3:
    #     h, w, c = Y_pred.shape
    #     Y_pred = np.reshape(Y_pred, (1,))

    n = len(Y_pred)
    for i in range(n):
        y_pred = Y_pred[i]
        y_gt = Y_gt[i]
        # y_pred_hotcode = hotcode(y_pred)
        # y_gt_hotcode = hotcode(y_gt)
        pred = np.reshape(y_pred, (y_pred.shape[0] * y_pred.shape[1], y_pred.shape[2]))
        gt = np.reshape(y_gt, (y_gt.shape[0] * y_gt.shape[1], y_gt.shape[2]))
        pred = [i for i in pred]
        gt = [i for i in gt]
        pred = to_class_no(pred)
        gt = to_class_no(gt)
        gt = np.asarray(gt, dtype='int32')
        pred = np.asarray(pred, dtype='int32')
        conf_matrix = confusion_matrix(gt, pred, labels=[0, 1, 2, 3, 4, 5, 6, 7, 8])
        kappa = cohen_kappa_score(gt, pred, labels=[0, 1, 2, 3, 4, 5, 6, 7])
        pixels = len(pred)
        total_pixels = total_pixels + pixels
        sudo_confusion_matrix = sudo_confusion_matrix + conf_matrix
        kappa_sum = kappa_sum + kappa

    final_confusion_matrix = sudo_confusion_matrix
    final_kappa = kappa_sum / n
    return final_confusion_matrix, final_kappa