Python torchtext.data.LabelField() Examples
The following are 7 code examples of torchtext.data.LabelField(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module torchtext.data, or try the search function.
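Before the examples, here is a minimal, self-contained sketch of how LabelField is typically used. It assumes the legacy torchtext API (torchtext <= 0.8, or torchtext.legacy.data in later releases); the toy sentences and the field names "text" and "label" are made up for illustration.

from torchtext import data  # on torchtext >= 0.9 use: from torchtext.legacy import data

# LabelField is a Field preset for classification targets:
# it forces sequential=False and unk_token=None.
TEXT = data.Field(sequential=True, tokenize=lambda s: s.split(), lower=True)
LABEL = data.LabelField()

fields = [("text", TEXT), ("label", LABEL)]
examples = [
    data.Example.fromlist(["a great movie", "pos"], fields),
    data.Example.fromlist(["a dull movie", "neg"], fields),
]
dataset = data.Dataset(examples, fields)

TEXT.build_vocab(dataset)
LABEL.build_vocab(dataset)   # label vocab has no <unk>/<pad> entries
print(LABEL.vocab.stoi)      # e.g. {'neg': 0, 'pos': 1}

The point of using LabelField rather than a plain Field for the target column is that the label vocabulary stays free of special tokens and the labels are never treated as token sequences, which is exactly what the examples below rely on.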
Example #1
Source File: test_field.py From text with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_vocab_size(self):
    # Set up fields
    question_field = data.Field(sequential=True)
    label_field = data.LabelField()

    # Copied from test_build_vocab with minor changes
    # Write TSV dataset and construct a Dataset
    self.write_test_ppid_dataset(data_format="tsv")
    tsv_fields = [("id", None), ("q1", question_field),
                  ("q2", question_field), ("label", label_field)]
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=tsv_fields)

    # Skipping json dataset as we can rely on the original build vocab test
    label_field.build_vocab(tsv_dataset)
    assert label_field.vocab.freqs == Counter({'1': 2, '0': 1})
    expected_stoi = {'1': 0, '0': 1}  # No <unk>
    assert dict(label_field.vocab.stoi) == expected_stoi
    # Turn the stoi dictionary into an itos list
    expected_itos = [x[0] for x in sorted(expected_stoi.items(),
                                          key=lambda tup: tup[1])]
    assert label_field.vocab.itos == expected_itos
Example #2
Source File: tc.py From torchtest with GNU General Public License v3.0 | 5 votes |
def load_data(batch_size=32):
    # define a tokenizer
    # tokenize = lambda s : nltk.word_tokenize(s)
    tokenize = lambda s: s.split()

    # fields : ( text_field, label_field )
    print(':: creating fields')
    text_field = data.Field(sequential=True, tokenize=tokenize,
                            lower=True, include_lengths=True,
                            batch_first=True, fix_length=200)
    # text_field = data.Field(sequential=True, tokenize=tokenize, lower=True)
    label_field = data.LabelField(sequential=False)

    # get IMDB data
    print(':: fetching IMDB data')
    train_data, test_data = datasets.IMDB.splits(text_field, label_field)

    # build vocabulary for fields
    text_field.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    label_field.build_vocab(train_data)

    # split train into train and valid
    train_data, valid_data = train_data.split()

    print(':: labels :', label_field.vocab.stoi)

    # iterators
    train_iter, test_iter, valid_iter = data.BucketIterator.splits(
        (train_data, test_data, valid_data),
        batch_size=batch_size,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True)

    return (
        (text_field, label_field),
        (train_iter, test_iter, valid_iter),
        text_field.vocab.vectors,  # GloVe vectors
        len(text_field.vocab)
    )
Example #3
Source File: test_field.py From text with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_init(self):
    # basic init
    label_field = data.LabelField()
    assert label_field.sequential is False
    assert label_field.unk_token is None

    # init with preset fields
    # (LabelField always overrides these kwargs, so the defaults still hold)
    label_field = data.LabelField(sequential=True, unk_token="<unk>")
    assert label_field.sequential is False
    assert label_field.unk_token is None
Example #4
Source File: load_data.py From Text-Classification-Pytorch with MIT License | 5 votes |
def load_dataset(test_sen=None):

    """
    tokenizer : Breaks sentences into a list of words. If sequential=False, no tokenization is applied.
    Field : A class that stores information about the way of preprocessing.
    fix_length : An important property of TorchText is that we can let the input be variable length, and TorchText will
                 dynamically pad each sequence to the longest sequence in that "batch". But here we are using fix_length,
                 which will pad each sequence to a fixed length of 200.
    build_vocab : It will first make a vocabulary (dictionary) mapping every unique word present in the train_data to an
                  index, and then use the GloVe word embedding to map each index to the corresponding word embedding.
    vocab.vectors : This returns a torch tensor of shape (vocab_size x embedding_dim) containing the pre-trained word embeddings.
    BucketIterator : Defines an iterator that batches examples of similar lengths together to minimize the amount of padding needed.
    """

    tokenize = lambda x: x.split()
    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True,
                      include_lengths=True, batch_first=True, fix_length=200)
    LABEL = data.LabelField(tensor_type=torch.FloatTensor)
    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train_data)

    word_embeddings = TEXT.vocab.vectors
    print("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    # Further splitting of training_data to create new training_data & validation_data
    train_data, valid_data = train_data.split()
    train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=32,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True)

    '''Alternatively we can also use the default configurations'''
    # train_iter, test_iter = datasets.IMDB.iters(batch_size=32)

    vocab_size = len(TEXT.vocab)

    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter
Example #5
Source File: dataset_reader.py From nlp-experiments-in-pytorch with MIT License | 5 votes |
def create_fields(self, seq_input=True, seq_ner=True, seq_cat=False):
    if self.level == "word":
        sentence_field = data.Field(sequential=seq_input,
                                    preprocessing=self.preprocessor,
                                    fix_length=self.fix_length,
                                    init_token="<start>",
                                    eos_token="<end>")
    elif self.level == "char":
        sentence_field = data.Field(sequential=seq_input,
                                    tokenize=self.evil_workaround_tokenizer,
                                    fix_length=1014)
        # sentence_field = data.NestedField(nested_field)
    else:
        raise KeyError("Sentence_field is undefined!")

    ner_label_field = data.Field(sequential=seq_ner, init_token="<start>",
                                 eos_token="<end>", unk_token=None)
    category_label_field = data.LabelField(sequential=seq_cat)

    return sentence_field, ner_label_field, category_label_field
Example #6
Source File: dataset.py From pytorch-sentiment-analysis-classification with MIT License | 5 votes |
def __init__(self, root_dir='data', batch_size=64, use_vector=True):
    self.TEXT = Field(sequential=True, use_vocab=True,
                      tokenize='spacy', lower=True, batch_first=True)
    self.LABEL = LabelField(tensor_type=torch.FloatTensor)
    vectors = Vectors(name='mr_vocab.txt', cache='./')

    dataset_path = os.path.join(root_dir, '{}.tsv')
    self.dataset = {}
    self.dataloader = {}
    for target in ['train', 'dev', 'test']:
        self.dataset[target] = TabularDataset(
            path=dataset_path.format(target),
            format='tsv',
            fields=[('text', self.TEXT), ('label', self.LABEL)]
        )
        if use_vector:
            self.TEXT.build_vocab(self.dataset[target], max_size=25000, vectors=vectors)
        else:
            self.TEXT.build_vocab(self.dataset[target], max_size=25000)

        self.LABEL.build_vocab(self.dataset[target])
        self.dataloader[target] = Iterator(self.dataset[target],
                                           batch_size=batch_size,
                                           device=None,
                                           repeat=False,
                                           sort_key=lambda x: len(x.text),
                                           shuffle=True)
Example #7
Source File: imdb.py From vel with MIT License | 5 votes |
def create(model_config, batch_size, vectors=None):
    """ Create an IMDB dataset """
    path = model_config.data_dir('imdb')

    text_field = data.Field(lower=True, tokenize='spacy', batch_first=True)
    label_field = data.LabelField(is_target=True)

    train_source, test_source = IMDBCached.splits(
        root=path, text_field=text_field, label_field=label_field
    )

    text_field.build_vocab(train_source, max_size=25_000, vectors=vectors)
    label_field.build_vocab(train_source)

    train_iterator, test_iterator = data.BucketIterator.splits(
        (train_source, test_source),
        batch_size=batch_size,
        device=model_config.torch_device(),
        shuffle=True
    )

    return TextData(
        train_source, test_source, train_iterator, test_iterator,
        text_field, label_field
    )