Python sklearn.preprocessing.MultiLabelBinarizer() Examples
The following are 30 code examples of sklearn.preprocessing.MultiLabelBinarizer().
Each example comes from an open-source project; the source file, project, and license are noted above each snippet.
You may also want to check out all available functions and classes of the sklearn.preprocessing module.
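Before the project examples, here is a minimal self-contained sketch (with made-up labels) of what MultiLabelBinarizer does: it turns collections of label sets into a binary indicator matrix with one column per unique label.

from sklearn.preprocessing import MultiLabelBinarizer

# Each sample carries a *set* of labels rather than a single label.
samples = [{"rock", "indie"}, {"jazz"}, {"rock", "jazz"}]

mlb = MultiLabelBinarizer()
indicator = mlb.fit_transform(samples)

print(mlb.classes_)  # ['indie' 'jazz' 'rock'] -- columns are sorted alphabetically
print(indicator)     # [[1 0 1]
                     #  [0 1 0]
                     #  [0 1 1]]
print(mlb.inverse_transform(indicator))  # recovers the label sets as tuples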
Example #1
Source File: evaluation.py From EUSIPCO2017 with GNU Affero General Public License v3.0 | 7 votes |
def __init__(self, model_module, weights_path, evaluation_strategy="s2"):
    """
    Test metadata format
    ---------------------
    filename : string
    class_ids: string of ints with space as a delimiter
    """
    test_dataset = pd.read_csv(IRMAS_TESTING_META_PATH, names=["filename", "class_ids"])
    self.X = list(test_dataset.filename)
    targets = [[int(category) for category in target.split()] for target in test_dataset.class_ids]
    self.ml_binarizer = MultiLabelBinarizer().fit(targets)
    self.y_true = self.ml_binarizer.transform(targets)
    self.y_pred = np.zeros(shape=self.y_true.shape)
    self.y_pred_raw = np.zeros(shape=self.y_true.shape)
    self.y_pred_raw_average = np.zeros(shape=self.y_true.shape)
    self.model_module = model_module
    self.weights_path = weights_path
    self.feature_filenames = os.listdir(os.path.join(IRMAS_TEST_FEATURE_BASEPATH, model_module.BASE_NAME))
    self.dataset_mean = np.load(os.path.join(MODEL_MEANS_BASEPATH, "{}_mean.npy".format(model_module.BASE_NAME)))
    self.evaluation_strategy = evaluation_strategy
    self.thresholds_s1 = [0.10, 0.12, 0.14, 0.16, 0.18, 0.20, 0.22, 0.24]
    self.thresholds_s2 = [0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
Example #2
Source File: data.py From keras-text with MIT License | 6 votes |
def __init__(self, inputs, labels, test_indices=None, **kwargs):
    """Encapsulates all pieces of data to run an experiment. This is basically a bag of items
    that makes it easy to serialize and deserialize everything as a unit.

    Args:
        inputs: The raw model inputs. This can be set to None if you don't want
            to serialize this value when you save the dataset.
        labels: The raw output labels.
        test_indices: The optional test indices to use. Ideally, this should be generated
            one time and reused across experiments to make results comparable.
            `generate_test_indices` can be used to generate them the first time.
        **kwargs: Additional key-value items to store.
    """
    self.X = np.array(inputs)
    self.y = np.array(labels)
    for key, value in kwargs.items():
        setattr(self, key, value)

    self._test_indices = None
    self._train_indices = None
    self.test_indices = test_indices

    self.is_multi_label = isinstance(labels[0], (set, list, tuple))
    self.label_encoder = MultiLabelBinarizer() if self.is_multi_label else LabelBinarizer()
    self.y = self.label_encoder.fit_transform(self.y).flatten()
Example #3
Source File: _machine_learning.py From qlik-py-tools with MIT License | 6 votes |
def text_similarity(df, col):
    """
    Convert strings to their unicode representation and then apply one hot encoding,
    creating one feature for each unique character in the column.
    This can be useful when similarity between strings is significant.
    """
    unique = pd.DataFrame(df[col].unique(), columns=[col])
    encoded = pd.DataFrame(unique.loc[:, col].apply(lambda s: [ord(a) for a in s]), index=unique.index)
    mlb = preprocessing.MultiLabelBinarizer()
    encoded = pd.DataFrame(mlb.fit_transform(encoded[col]), columns=mlb.classes_, index=encoded.index).add_prefix(col + "_")
    unique = unique.join(encoded)
    return unique.set_index(col)
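For intuition, here is a tiny standalone sketch of the character-level trick above, using only pandas and scikit-learn (the column name and strings are invented):

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Treat each string as the collection of its characters' Unicode code points.
unique = pd.DataFrame(["abc", "abd"], columns=["name"])
as_ordinals = unique["name"].apply(lambda s: [ord(ch) for ch in s])

mlb = MultiLabelBinarizer()
encoded = pd.DataFrame(mlb.fit_transform(as_ordinals),
                       columns=mlb.classes_).add_prefix("name_")
print(encoded.columns.tolist())  # ['name_97', 'name_98', 'name_99', 'name_100']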
Example #4
Source File: default_processor.py From text2vec with Apache License 2.0 | 6 votes |
def _build_label_dict(self, labels: List[str]):
    from sklearn.preprocessing import MultiLabelBinarizer
    if self.multi_label:
        label_set = set()
        for i in labels:
            label_set = label_set.union(list(i))
    else:
        label_set = set(labels)
    self.label2idx = {}
    for idx, label in enumerate(sorted(label_set)):
        self.label2idx[label] = len(self.label2idx)
    self.idx2label = dict([(value, key) for key, value in self.label2idx.items()])
    self.dataset_info['label_count'] = len(self.label2idx)
    self.multi_label_binarizer = MultiLabelBinarizer(classes=list(self.label2idx.keys()))
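The detail that matters above is the classes= argument: with an explicit class list, the binarizer's columns follow that order rather than sorted order. A minimal sketch with invented labels:

from sklearn.preprocessing import MultiLabelBinarizer

label2idx = {"neutral": 0, "positive": 1, "negative": 2}
mlb = MultiLabelBinarizer(classes=list(label2idx.keys()))
y = mlb.fit_transform([["positive"], ["negative", "neutral"]])
print(mlb.classes_)  # ['neutral' 'positive' 'negative'] -- insertion order, not sorted
print(y)             # [[0 1 0]
                     #  [1 0 1]]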
Example #5
Source File: train.py From stacks-usecase with Apache License 2.0 | 6 votes |
def feature_vectorizer(X_train, X_test, y_train, y_test):
    """prepare X data with tfidf and y with multi label binarizer"""
    vectorizer = TfidfVectorizer(
        analyzer="word",
        min_df=0.0,
        max_df=1.0,
        strip_accents=None,
        encoding="utf-8",
        preprocessor=None,
        token_pattern=r"(?u)\S\S+",
        max_features=1000,
    )
    # fit only training data
    vectorizer.fit(X_train)
    _save_data(vectorizer, "/workdir/models/X_vectorizer.pk")
    X_train_features = vectorizer.transform(X_train)
    X_test_features = vectorizer.transform(X_test)

    # use MultiLabelBinarizer to create one-hot encoding of labels for y data
    mlb = MultiLabelBinarizer()
    # fit only training data
    mlb.fit(y_train)
    _save_data(mlb, "/workdir/models/label_binarizer.pk")
    y_train_features = mlb.transform(y_train)
    y_test_features = mlb.transform(y_test)
    return X_train_features, X_test_features, y_train_features, y_test_features
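Because the binarizer above is fit on training labels only, label classes that first appear in the test set get no column; transform drops them with a UserWarning. A quick sketch with invented labels:

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
mlb.fit([["bug", "docs"], ["feature"]])     # training labels only
print(mlb.transform([["bug", "wontfix"]]))  # [[1 0 0]] -- "wontfix" is ignored with a warning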
Example #6
Source File: pipeline.py From image-captioning-for-mortals with BSD 3-Clause "New" or "Revised" License | 6 votes |
def prepVect(min_df=2, max_features=50000, n_captions=5, n_sbu=None, multilabel=False):
    print "prepping the Word Tokenizer..."
    _0, _1, trY, _3 = coco(mode='full', n_captions=n_captions)
    if n_sbu:
        _4, sbuY, _5 = sbuXYFilenames(n_sbu)
        trY.extend(sbuY)
    vect = Tokenizer(min_df=min_df, max_features=max_features)
    captions = sampleCaptions(trY, n_captions)
    vect.fit(captions)
    if multilabel:
        mlb = MultiLabelBinarizer()
        mlb.fit(vect.transform(captions))
        return vect, mlb
    # if not multilabel:
    return vect
Example #7
Source File: char2ir_gpu.py From plastering with MIT License | 6 votes |
def evaluate(self, preds):
    acc = eval_func.sequential_accuracy(
        [self.label_dict[srcid] for srcid in preds.keys()],
        [preds[srcid] for srcid in preds.keys()])
    pred = [preds[srcid] for srcid in preds.keys()]
    true = [self.label_dict[srcid] for srcid in preds.keys()]
    mlb = MultiLabelBinarizer()
    mlb.fit(pred + true)
    encoded_true = mlb.transform(true)
    encoded_pred = mlb.transform(pred)
    macro_f1 = f1_score(encoded_true, encoded_pred, average='macro')
    f1 = f1_score(encoded_true, encoded_pred, average='weighted')
    res = {
        'accuracy': acc,
        'f1': f1,
        'macro_f1': macro_f1,
    }
    return res
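The pattern to note: the binarizer is fit on pred + true so that a label seen in only one of the two lists still gets a column, and both encodings share the same column order. A small sketch with invented labels:

from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

true = [['pump'], ['fan', 'sensor']]
pred = [['pump'], ['fan']]

mlb = MultiLabelBinarizer().fit(pred + true)  # union of all observed labels
print(f1_score(mlb.transform(true), mlb.transform(pred), average='macro'))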
Example #8
Source File: test_preprocessing.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_objectmapper(self):
    df = pdml.ModelFrame([])
    self.assertIs(df.preprocessing.Binarizer, pp.Binarizer)
    self.assertIs(df.preprocessing.FunctionTransformer, pp.FunctionTransformer)
    self.assertIs(df.preprocessing.Imputer, pp.Imputer)
    self.assertIs(df.preprocessing.KernelCenterer, pp.KernelCenterer)
    self.assertIs(df.preprocessing.LabelBinarizer, pp.LabelBinarizer)
    self.assertIs(df.preprocessing.LabelEncoder, pp.LabelEncoder)
    self.assertIs(df.preprocessing.MultiLabelBinarizer, pp.MultiLabelBinarizer)
    self.assertIs(df.preprocessing.MaxAbsScaler, pp.MaxAbsScaler)
    self.assertIs(df.preprocessing.MinMaxScaler, pp.MinMaxScaler)
    self.assertIs(df.preprocessing.Normalizer, pp.Normalizer)
    self.assertIs(df.preprocessing.OneHotEncoder, pp.OneHotEncoder)
    self.assertIs(df.preprocessing.PolynomialFeatures, pp.PolynomialFeatures)
    self.assertIs(df.preprocessing.RobustScaler, pp.RobustScaler)
    self.assertIs(df.preprocessing.StandardScaler, pp.StandardScaler)
Example #9
Source File: evaluation.py From BioNEV with MIT License | 6 votes |
def NodeClassification(embedding_look_up, node_list, labels, testing_ratio, seed):

    X_train, y_train, X_test, y_test = split_train_test_classify(embedding_look_up, node_list, labels,
                                                                 testing_ratio=testing_ratio, seed=seed)
    binarizer = MultiLabelBinarizer(sparse_output=True)
    y_all = np.append(y_train, y_test)
    binarizer.fit(y_all)
    y_train = binarizer.transform(y_train).todense()
    y_test = binarizer.transform(y_test).todense()
    model = OneVsRestClassifier(LogisticRegression(random_state=seed, solver='lbfgs'))
    model.fit(X_train, y_train)
    y_pred_prob = model.predict_proba(X_test)

    # small trick: we assume that we know how many labels to predict
    y_pred = get_y_pred(y_test, y_pred_prob)

    accuracy = accuracy_score(y_test, y_pred)
    micro_f1 = f1_score(y_test, y_pred, average="micro")
    macro_f1 = f1_score(y_test, y_pred, average="macro")

    print('#' * 9 + ' Node Classification Performance ' + '#' * 9)
    print(f'Accuracy: {accuracy:.3f}, Micro-F1: {micro_f1:.3f}, Macro-F1: {macro_f1:.3f}')
    print('#' * 50)
    return accuracy, micro_f1, macro_f1
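One detail worth isolating: with sparse_output=True the binarizer returns a SciPy CSR matrix, which the example above densifies before fitting the one-vs-rest model. A sketch:

from sklearn.preprocessing import MultiLabelBinarizer

y = [['a'], ['a', 'b'], ['c']]
binarizer = MultiLabelBinarizer(sparse_output=True)
encoded = binarizer.fit_transform(y)   # scipy.sparse CSR matrix
print(encoded.todense())               # [[1 0 0]
                                       #  [1 1 0]
                                       #  [0 0 1]]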
Example #10
Source File: test_discovery.py From fairtest with Apache License 2.0 | 6 votes |
def setUp(self):
    FILENAME = "../data/images/overfeat_raw.txt"
    data = prepare.data_from_csv(FILENAME, sep='\\t')
    TARGET = 'Labels'
    self.SENS = ['Race']
    self.EXPL = []

    labeled_data = [ast.literal_eval(s) for s in data[TARGET]]
    for l in labeled_data:
        assert len(l) == 5
    label_encoder = preprocessing.MultiLabelBinarizer()
    labeled_data = label_encoder.fit_transform(labeled_data)
    labels = label_encoder.classes_
    df_labels = pd.DataFrame(labeled_data, columns=labels)
    self.data = DataSource(pd.concat([data.drop(TARGET, axis=1), df_labels], axis=1))
    self.TARGET = labels.tolist()
Example #11
Source File: feature_expansion.py From KDDCup2019_admin with MIT License | 6 votes |
def cat_onehot_encoder_m(df, y, col, selection=True):
    ## ZJN: test raise memory error
    # raise MemoryError

    mlbs = MultiLabelBinarizer(sparse_output=True).fit(df.values)
    from scipy.sparse import csr_matrix
    features_tmp = mlbs.transform(df.values)
    features_tmp = csr_matrix(features_tmp, dtype=float).tocsr()

    models = None
    auc_score = None
    if selection is True:
        auc_score, models = train_lightgbm_for_feature_selection(features_tmp, y)
        print(col, "auc", auc_score)

    # new_feature = pd.DataFrame(features_tmp, columns=["mul_feature_" + col])
    new_feature = features_tmp

    from scipy.sparse import hstack
    return new_feature, mlbs, models, auc_score
Example #12
Source File: data_helper.py From HFT-CNN with MIT License | 6 votes |
def build_input_label_data(labels, class_order):
    from sklearn.preprocessing import MultiLabelBinarizer
    from itertools import chain

    bml = MultiLabelBinarizer(classes=class_order, sparse_output=True)
    indexes = sp.find(bml.fit_transform(labels))
    y = []
    for i in range(len(labels)):
        y.append([])
    for i, j in zip(indexes[0], indexes[1]):
        y[i].append(j)
    return y

# padding operation
# =========================================================
Example #13
Source File: multi_class_svm.py From JusticeAI with MIT License | 5 votes |
def __classify_precedent(self, precedent):
    """
    1) The data looks as such: [1, 1, 1, 0, 1, 0, 0, 1...]

    2) We must create a new list with only the indices of the columns
       where there are values of '1'. This is necessary because the
       sklearn algorithm expects this kind of input.

    3) Reshape the y data

       The MultiLabelBinarizer expects a series of labels for binarization.
       From all the collected labels, it finds all the unique ones in order
       to figure out how many columns are needed in the vector. From this,
       it will place 1's and 0's accordingly in the columns. For this
       purpose, we cannot create a binarized vector here but instead we
       return the labels which are true for an outcome.

       Example:          (transformation)
       [1, 1, 0, 0, 1] ------------------> [0, 1, 4]

    4) Create a 2D numpy array from the new list:
       [[precedent #1 outcomes], [precedent #2 outcomes], ...]

    :param precedent: dict{'facts_vector': [], 'outcomes_vector': [], 'demands_vector': []}
    :return: np.array([0, 1, 4, ...])
    """
    classified_precedent = []
    outcome_vector = precedent['outcomes_vector']
    for i in range(len(outcome_vector)):
        if outcome_vector[i] >= 1:
            classified_precedent.append(i)
    return classified_precedent
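The docstring's transformation is easy to verify in isolation, and the index lists produced here are exactly the input format MultiLabelBinarizer expects back (a sketch, not project code):

from sklearn.preprocessing import MultiLabelBinarizer

outcome_vector = [1, 1, 0, 0, 1]
indices = [i for i, v in enumerate(outcome_vector) if v >= 1]
print(indices)  # [0, 1, 4]

# Feeding such index lists back in reproduces the binarized form:
mlb = MultiLabelBinarizer()
print(mlb.fit_transform([indices, [0, 1]]))  # [[1 1 1]
                                             #  [1 1 0]]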
Example #14
Source File: multi_class_svm.py From JusticeAI with MIT License | 5 votes |
def train(self):
    """
    Train a classifier using Linear SVC
    1) reshape data into a format that sklearn understands
    2) Binarize data for multi output
    3) split training data
    4) train (fit)
    5) test model
    :return: None
    """
    x_total, y_total = self.reshape_dataset()  # 1

    self.mlb = MultiLabelBinarizer()  # 2
    y_total = self.mlb.fit_transform(y_total)

    x_train, x_test, y_train, y_test = train_test_split(
        x_total, y_total, test_size=0.20, random_state=42)  # 3

    Log.write("Sample size: {}".format(len(x_total)))
    Log.write("Train size: {}".format(len(x_train)))
    Log.write("Test size: {}".format(len(x_test)))
    Log.write("Training Classifier Using Multi Class SVM")

    clf = OneVsRestClassifier(SVC(kernel='linear', random_state=42, probability=True))  # 4
    clf.fit(x_train, y_train)
    self.model = clf
    self.__test(x_test, y_test)  # 5
Example #15
Source File: preprocessing.py From arxiv-twitterbot with MIT License | 5 votes |
def get_features_matrix(df, min_author_freq=3, min_term_freq=30, ngram_range=(1, 3)):
    """Return numpy array of data for sklearn models"""
    text = [title + ' ' + summary for title, summary in zip(df.title.values, df.summary.values)]
    vectorizer = TfidfVectorizer(min_df=min_term_freq, stop_words='english', ngram_range=ngram_range)
    text_features = vectorizer.fit_transform(text).toarray()

    author_counts = pd.Series([a for author_set in df.authors.values for a in author_set]).value_counts()
    allowed_authors = author_counts[author_counts >= min_author_freq].index
    filtered_authors = df.authors.apply(lambda authors: [a for a in authors if a in allowed_authors])
    author_binarizer = MultiLabelBinarizer()
    author_features = author_binarizer.fit_transform(filtered_authors.values)

    category_dummies = pd.get_dummies(df.category)
    category_features = category_dummies.values

    all_features = [text_features, author_features, category_features]
    x = np.concatenate(all_features, axis=1)
    if 'tweeted' in df:
        y = df.tweeted.astype(int).values
    else:
        y = None
    feature_names = np.concatenate((vectorizer.get_feature_names(),
                                    category_dummies.columns.values,
                                    author_binarizer.classes_))
    return x, y, feature_names
Example #16
Source File: optimization.py From sports-betting with MIT License | 5 votes |
def calculate_yields(score1, score2, bets, odds, targets):
    """Calculate the yields."""

    # Check odds
    odds = check_array(odds)
    targets = check_array(targets, dtype=object, ensure_2d=False)

    # Generate yields
    bets = MultiLabelBinarizer(classes=['-'] + targets.tolist()).fit_transform([[bet] for bet in bets])[:, 1:]
    yields = ((extract_multi_labels(score1, score2, targets) * odds - 1.0) * bets).sum(axis=1)

    return yields
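Two tricks above are worth unpacking: each bet is wrapped in its own one-element list (the binarizer expects collections of labels), and a dummy '-' class, which appears to act as a no-bet sentinel, is prepended and then sliced off so that a '-' bet binarizes to an all-zero row. A sketch with invented outcomes:

from sklearn.preprocessing import MultiLabelBinarizer

targets = ['H', 'D', 'A']   # hypothetical outcome labels
bets = ['H', '-', 'A']      # '-' presumably means no bet placed

mlb = MultiLabelBinarizer(classes=['-'] + targets)
rows = mlb.fit_transform([[bet] for bet in bets])[:, 1:]  # drop the '-' column
print(rows)  # [[1 0 0]
             #  [0 0 0]   <- the no-bet row is all zeros
             #  [0 0 1]]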
Example #17
Source File: classify.py From BioNEV with MIT License | 5 votes |
def __init__(self, vectors, clf):
    self.embeddings = vectors
    self.clf = TopKRanker(clf)
    self.binarizer = MultiLabelBinarizer(sparse_output=True)
Example #18
Source File: test_BRKNN.py From Quadflor with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_BRKnnb_auto_optimize_k(self):
    data = csr.csr_matrix([[0, 1], [1, 1], [0, 1.1], [1.1, 1]])
    train_ids = [['lid0', 'lid1'], ['lid0', 'lid1'], ['lid2', 'lid3'], ['lid0', 'lid1']]
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(train_ids)
    knn = BRKNeighborsClassifier(mode='b', n_neighbor_candidates=[1, 3], auto_optimize_k=True)

    # noinspection PyUnusedLocal
    def fun(s, X, y_):
        return data[[1, 2, 3]], data[[0]], y[[1, 2, 3]], y[[0]]

    BRKNeighborsClassifier._get_split = fun
    knn.fit(data, y)
    self.assertEquals(3, knn.n_neighbors)
    pred = knn.predict(csr.csr_matrix([[0.1, 1], [2, 2]])).todense()
    np.testing.assert_array_equal([[1, 1, 0, 0], [1, 1, 0, 0]], pred)

# def test_time_brknnb(self):
#     times = []
#     X = sp.rand(10000, 5000, density=0.005, format='csr')
#     y = sp.rand(10000, 3000, density=0.005, format='csr')
#     knn = BRKNeighborsClassifier(n_neighbors=100)
#     knn.fit(X, y)
#     X_test = sp.rand(1000, 5000, density=0.005, format='csr')
#     for _ in range(5):
#         start = default_timer()
#         knn.predict(X_test)
#         times.append(default_timer() - start)
#     print(np.mean(times))
Example #19
Source File: multiclass.py From sk-dist with Apache License 2.0 | 5 votes |
def fit(self, X, y, **fit_params):
    """
    Fit underlying estimators. Parallelize fit operation using spark.

    Args:
        X (array-like, shape = [n_samples, n_features]): input data
        y (array-like, shape = [n_samples, ], [n_samples, n_classes]): multi-class targets
        **fit_params (dict of string -> object): parameters passed
            to the ``fit`` method of the estimator
    """
    _check_estimator(self, verbose=self.verbose)
    if (not self.mlb_override and not hasattr(y[0], '__array__')
            and isinstance(y[0], Sequence) and not isinstance(y[0], str)):
        self.mlb = MultiLabelBinarizer()
        y = self.mlb.fit_transform(y)
    if isinstance(X, pd.DataFrame):
        X.index = list(range(len(X)))
    self.label_binarizer_ = LabelBinarizer(sparse_output=True)
    self.label_binarizer_.fit(y)
    self.classes_ = self.label_binarizer_.classes_
    self._fit(X, y, **fit_params)
    del self.sc
    if hasattr(self.estimator, "sc"):
        del self.estimator.sc
    return self
Example #20
Source File: test_BRKNN.py From Quadflor with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_BRKnnb_predict_dense(self):
    data = csr.csr_matrix([[0, 1], [1, 1], [1.5, 1], [0.5, 1]])
    train_ids = [['lid0', 'lid1'], ['lid0', 'lid1'], ['lid4', 'lid3'], ['lid4', 'lid5']]
    mlb = MultiLabelBinarizer(sparse_output=False)
    y = mlb.fit_transform(train_ids)
    knn = BRKNeighborsClassifier(mode='b', n_neighbors=3)
    knn.fit(data, y)
    pred = knn.predict(csr.csr_matrix([[0, 1]])).todense()
    np.testing.assert_array_equal([[1, 1, 0, 0, 0]], pred)
Example #21
Source File: test_BRKNN.py From Quadflor with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_BRKnnb_predict(self):
    data = csr.csr_matrix([[0, 1], [1, 1], [1.5, 1], [0.5, 1]])
    train_ids = [['lid0', 'lid1'], ['lid0', 'lid1'], ['lid4', 'lid3'], ['lid4', 'lid5']]
    mlb = MultiLabelBinarizer(sparse_output=True)
    y = mlb.fit_transform(train_ids)
    knn = BRKNeighborsClassifier(mode='b', n_neighbors=3)
    knn.fit(data, y)
    pred = knn.predict(csr.csr_matrix([[0, 1]])).todense()
    np.testing.assert_array_equal([[1, 1, 0, 0, 0]], pred)
Example #22
Source File: test_BRKNN.py From Quadflor with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_BRKnna_predict_dense(self):
    data = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0.5, 1]])
    train_ids = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid4', 'lid3'], ['lid4', 'lid5']]
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(train_ids)
    knn = BRKNeighborsClassifier(threshold=0.5, n_neighbors=3, mode='a')
    knn.fit(data, y)
    pred = knn.predict(csr.csr_matrix([[1.1, 1.1]])).todense()
    np.testing.assert_array_equal([[0, 0, 0, 1, 1, 0]], pred)
Example #23
Source File: modularity_explicitness.py From disentanglement_lib with Apache License 2.0 | 5 votes |
def explicitness_per_factor(mus_train, y_train, mus_test, y_test):
    """Compute explicitness score for a factor as ROC-AUC of a classifier.

    Args:
        mus_train: Representation for training, (num_codes, num_points)-np array.
        y_train: Ground truth factors for training, (num_factors, num_points)-np array.
        mus_test: Representation for testing, (num_codes, num_points)-np array.
        y_test: Ground truth factors for testing, (num_factors, num_points)-np array.

    Returns:
        roc_train: ROC-AUC score of the classifier on training data.
        roc_test: ROC-AUC score of the classifier on testing data.
    """
    x_train = np.transpose(mus_train)
    x_test = np.transpose(mus_test)
    clf = LogisticRegression().fit(x_train, y_train)
    y_pred_train = clf.predict_proba(x_train)
    y_pred_test = clf.predict_proba(x_test)
    mlb = MultiLabelBinarizer()
    roc_train = roc_auc_score(mlb.fit_transform(np.expand_dims(y_train, 1)), y_pred_train)
    roc_test = roc_auc_score(mlb.fit_transform(np.expand_dims(y_test, 1)), y_pred_test)
    return roc_train, roc_test
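Here MultiLabelBinarizer is used as a plain one-hot encoder: np.expand_dims wraps each single label in its own row, producing an indicator matrix aligned with predict_proba's columns. A sketch with invented values:

import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

y = np.array([0, 2, 1, 0])
one_hot = MultiLabelBinarizer().fit_transform(np.expand_dims(y, 1))
print(one_hot)  # [[1 0 0]
                #  [0 0 1]
                #  [0 1 0]
                #  [1 0 0]]

Note that the example refits the binarizer separately on train and test, which only lines up if both splits contain the same set of factor values.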
Example #24
Source File: test_BRKNN.py From Quadflor with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_BRKnna_predict(self):
    data = csr.csr_matrix([[0, 1], [1, 1], [1, 1.1], [0.5, 1]])
    train_ids = [['lid0', 'lid1'], ['lid2', 'lid3'], ['lid4', 'lid3'], ['lid4', 'lid5']]
    mlb = MultiLabelBinarizer(sparse_output=True)
    y = mlb.fit_transform(train_ids)
    knn = BRKNeighborsClassifier(threshold=0.5, n_neighbors=3, mode='a')
    knn.fit(data, y)
    pred = knn.predict(csr.csr_matrix([[1.1, 1.1]])).todense()
    np.testing.assert_array_equal([[0, 0, 0, 1, 1, 0]], pred)
Example #25
Source File: eval_func.py From plastering with MIT License | 5 votes |
def binarize_labels(true_labels, pred_labels, excluding_labels=[]):
    # note: the keyword argument is immediately overridden here in the original source
    excluding_labels = ['building-ebu3b']
    srcids = list(pred_labels.keys())
    tot_labels = [[label for label in labels if label not in excluding_labels]
                  for labels in list(pred_labels.values()) + list(true_labels.values())]
    mlb = MultiLabelBinarizer().fit(tot_labels)
    pred_mat = mlb.transform(pred_labels.values())
    true_mat = mlb.transform(true_labels.values())
    return true_mat, pred_mat
Example #26
Source File: classify.py From OpenNE with MIT License | 5 votes |
def __init__(self, vectors, clf):
    self.embeddings = vectors
    self.clf = TopKRanker(clf)
    self.binarizer = MultiLabelBinarizer(sparse_output=True)
Example #27
Source File: test_multilabel_realdata.py From libact with BSD 2-Clause "Simplified" License | 5 votes |
def setUp(self):
    dataset_filepath = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'datasets/yeast_train.svm')
    X, y = load_svmlight_file(dataset_filepath, multilabel=True)
    self.X = X.todense().tolist()
    self.y = MultiLabelBinarizer().fit_transform(y).tolist()
    self.quota = 10
Example #28
Source File: discovery.py From fairtest with Apache License 2.0 | 5 votes |
def main(argv=sys.argv):
    if len(argv) != 1:
        usage(argv)

    FILENAME = "../../../data/recommender/recommendations.txt"
    OUTPUT_DIR = "."

    data = prepare.data_from_csv(FILENAME, sep='\\t',
                                 to_drop=['RMSE', 'Avg Movie Age',
                                          'Avg Recommended Rating',
                                          'Avg Seen Rating', 'Occupation'])
    TARGET = 'Types'
    SENS = ['Gender']
    EXPL = []

    labeled_data = [ast.literal_eval(s) for s in data[TARGET]]
    for labels in labeled_data:
        assert len(labels) == 5
    label_encoder = preprocessing.MultiLabelBinarizer()
    labeled_data = label_encoder.fit_transform(labeled_data)
    labels = label_encoder.classes_
    df_labels = pd.DataFrame(labeled_data, columns=labels)
    data = pd.concat([data.drop(TARGET, axis=1), df_labels], axis=1)
    TARGET = labels.tolist()

    data_source = DataSource(data)

    # Instantiate the experiment
    inv = Discovery(data_source, SENS, TARGET, EXPL, topk=10, random_state=0)

    # Train the classifier
    train([inv])

    # Evaluate on the testing set
    test([inv])

    # Create the report
    report([inv], "discovery", OUTPUT_DIR)
Example #29
Source File: classify.py From GraphEmbedding with MIT License | 5 votes |
def __init__(self, embeddings, clf):
    self.embeddings = embeddings
    self.clf = TopKRanker(clf)
    self.binarizer = MultiLabelBinarizer(sparse_output=True)
Example #30
Source File: preprocess.py From gnn-benchmark with MIT License | 5 votes |
def binarize_labels(labels, sparse_output=False, return_classes=False):
    """Convert labels vector to a binary label matrix.

    In the default single-label case, labels look like
    labels = [y1, y2, y3, ...].
    Also supports the multi-label format.
    In this case, labels should look something like
    labels = [[y11, y12], [y21, y22, y23], [y31], ...].

    Parameters
    ----------
    labels : array-like, shape [num_samples]
        Array of node labels in categorical single- or multi-label format.
    sparse_output : bool, default False
        Whether to return the label_matrix in CSR format.
    return_classes : bool, default False
        Whether to return the classes corresponding to the columns of the label matrix.

    Returns
    -------
    label_matrix : np.ndarray or sp.csr_matrix, shape [num_samples, num_classes]
        Binary matrix of class labels.
        num_classes = number of unique values in "labels" array.
        label_matrix[i, k] = 1 <=> node i belongs to class k.
    classes : np.array, shape [num_classes], optional
        Classes that correspond to each column of the label_matrix.
    """
    if hasattr(labels[0], '__iter__'):  # labels[0] is iterable <=> multilabel format
        binarizer = MultiLabelBinarizer(sparse_output=sparse_output)
    else:
        binarizer = LabelBinarizer(sparse_output=sparse_output)
    label_matrix = binarizer.fit_transform(labels).astype(np.float32)
    return (label_matrix, binarizer.classes_) if return_classes else label_matrix
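A quick sketch of how the dispatch above behaves on both input shapes, assuming the function and its imports are in scope (toy labels; note that Python strings are iterable too, so string class names would take the multi-label branch):

import numpy as np

single = np.array([0, 1, 0, 2])      # single-label, integer classes
multi = [[0, 1], [2], [0, 2], [1]]   # multi-label

print(binarize_labels(single))
# [[1. 0. 0.]
#  [0. 1. 0.]
#  [1. 0. 0.]
#  [0. 0. 1.]]

print(binarize_labels(multi, return_classes=True)[1])  # array([0, 1, 2])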