Python Examples of torchtext.data.LabelField

Source File: test_field.py From text with BSD 3-Clause "New" or "Revised" License

6 votes

def test_vocab_size(self):
        """Check that a LabelField vocabulary built from the TSV fixture has no <unk> entry."""
        # One text field shared by both question columns, plus the label field
        q_field = data.Field(sequential=True)
        label_field = data.LabelField()

        # Adapted from test_build_vocab: write the TSV fixture, then load it
        self.write_test_ppid_dataset(data_format="tsv")
        columns = [
            ("id", None),
            ("q1", q_field),
            ("q2", q_field),
            ("label", label_field),
        ]
        dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path,
            format="tsv",
            fields=columns,
        )

        # The json dataset is left to the original build vocab test
        label_field.build_vocab(dataset)
        vocab = label_field.vocab
        assert vocab.freqs == Counter({'1': 2, '0': 1})

        expected_stoi = {'1': 0, '0': 1}  # No <unk>
        assert dict(vocab.stoi) == expected_stoi

        # The itos list is the stoi keys ordered by their assigned index
        expected_itos = sorted(expected_stoi, key=expected_stoi.get)
        assert vocab.itos == expected_itos

Source File: tc.py From torchtest with GNU General Public License v3.0

5 votes

def load_data(batch_size=32):
    """Build IMDB fields, vocabularies and bucketed iterators.

    Args:
        batch_size: Batch size handed to ``data.BucketIterator.splits``.

    Returns:
        A 4-tuple ``(fields, iterators, vectors, vocab_size)`` where
        ``fields`` is ``(text_field, label_field)``, ``iterators`` is
        ``(train_iter, test_iter, valid_iter)``, ``vectors`` is
        ``text_field.vocab.vectors`` and ``vocab_size`` is
        ``len(text_field.vocab)``.
    """
    def tokenize(s):
        # Plain whitespace tokenization
        return s.split()

    print(':: creating fields')
    text_field = data.Field(
        sequential=True,
        tokenize=tokenize,
        lower=True,
        include_lengths=True,
        batch_first=True,
        fix_length=200,
    )
    label_field = data.LabelField(sequential=False)

    print(':: fetching IMDB data')
    train_data, test_data = datasets.IMDB.splits(text_field, label_field)

    # Both vocabularies are built from the full training split,
    # before the validation subset is carved out of it.
    glove = GloVe(name='6B', dim=300)
    text_field.build_vocab(train_data, vectors=glove)
    label_field.build_vocab(train_data)

    train_data, valid_data = train_data.split()

    print(':: labels :', label_field.vocab.stoi)

    # Note the order: train, test, valid
    splits = (train_data, test_data, valid_data)
    train_iter, test_iter, valid_iter = data.BucketIterator.splits(
        splits,
        batch_size=batch_size,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True,
    )

    fields = (text_field, label_field)
    iterators = (train_iter, test_iter, valid_iter)
    vocab = text_field.vocab
    return (fields, iterators, vocab.vectors, len(vocab))

Source File: test_field.py From text with BSD 3-Clause "New" or "Revised" License

5 votes

def test_init(self):
        """LabelField must end up non-sequential with no unk token, whatever is passed in."""
        init_variants = (
            {},  # basic init
            {"sequential": True, "unk_token": "<unk>"},  # init with preset fields
        )
        for kwargs in init_variants:
            label_field = data.LabelField(**kwargs)
            assert label_field.sequential is False
            assert label_field.unk_token is None

Source File: load_data.py From Text-Classification-Pytorch with MIT License

5 votes

def load_dataset(test_sen=None):
    """Prepare IMDB text/label fields, GloVe-backed vocabulary and iterators.

    The text field tokenizes on whitespace, lower-cases, and is created with
    ``fix_length=200``, ``include_lengths=True`` and ``batch_first=True``.
    The text vocabulary is built from the training split with
    ``GloVe(name='6B', dim=300)`` vectors; the label vocabulary is built from
    the same split. The training split is then divided again to obtain a
    validation split, and ``BucketIterator.splits`` creates one iterator per
    split with a batch size of 32, sorted by text length.

    Args:
        test_sen: Not used by this function.

    Returns:
        ``(TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter)``
        where ``word_embeddings`` is ``TEXT.vocab.vectors`` and ``vocab_size``
        is ``len(TEXT.vocab)``.
    """
    def tokenize(x):
        return x.split()

    text_field = data.Field(
        sequential=True,
        tokenize=tokenize,
        lower=True,
        include_lengths=True,
        batch_first=True,
        fix_length=200,
    )
    label_field = data.LabelField(tensor_type=torch.FloatTensor)

    train_data, test_data = datasets.IMDB.splits(text_field, label_field)
    text_field.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    label_field.build_vocab(train_data)

    word_embeddings = text_field.vocab.vectors
    vocab_size = len(text_field.vocab)
    print("Length of Text Vocabulary: " + str(vocab_size))
    print("Vector size of Text Vocabulary: ", word_embeddings.size())
    print("Label Length: " + str(len(label_field.vocab)))

    # Split the training data once more to get a validation set
    train_data, valid_data = train_data.split()
    train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=32,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True,
    )
    # Alternative using the default configuration:
    #   train_iter, test_iter = datasets.IMDB.iters(batch_size=32)

    return text_field, vocab_size, word_embeddings, train_iter, valid_iter, test_iter

Source File: dataset_reader.py From nlp-experiments-in-pytorch with MIT License

5 votes

def create_fields(self, seq_input=True, seq_ner=True, seq_cat=False):
        """Create the sentence, NER-label and category-label fields.

        Args:
            seq_input: ``sequential`` flag for the sentence field.
            seq_ner: ``sequential`` flag for the NER label field.
            seq_cat: ``sequential`` flag passed to the category ``LabelField``.

        Returns:
            ``(sentence_field, ner_label_field, category_label_field)``.

        Raises:
            KeyError: If ``self.level`` is neither ``"word"`` nor ``"char"``.
        """
        level = self.level
        if level == "word":
            # Word level: use the configured preprocessor and fix_length
            sentence_field = data.Field(
                sequential=seq_input,
                preprocessing=self.preprocessor,
                fix_length=self.fix_length,
                init_token="<start>",
                eos_token="<end>",
            )
        elif level == "char":
            # Character level: custom tokenizer and a hard-coded length of 1014
            sentence_field = data.Field(
                sequential=seq_input,
                tokenize=self.evil_workaround_tokenizer,
                fix_length=1014,
            )
        else:
            raise KeyError("Sentence_field is undefined!")

        ner_label_field = data.Field(
            sequential=seq_ner,
            init_token="<start>",
            eos_token="<end>",
            unk_token=None,
        )
        category_label_field = data.LabelField(sequential=seq_cat)

        return sentence_field, ner_label_field, category_label_field

Source File: dataset.py From pytorch-sentiment-analysis-classification with MIT License

5 votes

def __init__(self, root_dir='data', batch_size=64, use_vector=True):
        """Load the train/dev/test TSV splits and create one iterator per split.

        Args:
            root_dir: Directory containing ``train.tsv``, ``dev.tsv`` and
                ``test.tsv``.
            batch_size: Batch size passed to every ``Iterator``.
            use_vector: When true, the vectors from ``mr_vocab.txt`` are
                attached to the text vocabulary.

        After construction ``self.dataset`` and ``self.dataloader`` are dicts
        keyed by ``'train'``, ``'dev'`` and ``'test'``.
        """
        self.TEXT = Field(sequential=True, use_vocab=True,
                          tokenize='spacy', lower=True, batch_first=True)
        self.LABEL = LabelField(tensor_type=torch.FloatTensor)

        # Only read the vector file when it is actually going to be used.
        vocab_kwargs = {'max_size': 25000}
        if use_vector:
            vocab_kwargs['vectors'] = Vectors(name='mr_vocab.txt', cache='./')

        dataset_path = os.path.join(root_dir, '{}.tsv')
        self.dataset = {}
        self.dataloader = {}
        for target in ['train', 'dev', 'test']:
            self.dataset[target] = TabularDataset(
                path=dataset_path.format(target),
                format='tsv',
                fields=[('text', self.TEXT), ('label', self.LABEL)]
            )
            if target == 'train':
                # TEXT and LABEL are shared by all three splits. Building the
                # vocabulary once per split replaced it each time, leaving the
                # fields with a vocabulary derived from the test split. Build
                # it a single time, from the training data only.
                self.TEXT.build_vocab(self.dataset[target], **vocab_kwargs)
                self.LABEL.build_vocab(self.dataset[target])

            self.dataloader[target] = Iterator(self.dataset[target],
                                               batch_size=batch_size,
                                               device=None,
                                               repeat=False,
                                               sort_key=lambda x: len(x.text),
                                               shuffle=True)

Source File: imdb.py From vel with MIT License

5 votes

def create(model_config, batch_size, vectors=None):
    """Create an IMDB dataset.

    Args:
        model_config: Supplies the data directory (``data_dir('imdb')``) and
            the device for the iterators (``torch_device()``).
        batch_size: Batch size for both iterators.
        vectors: Optional vectors forwarded to the text vocabulary.

    Returns:
        A ``TextData`` built from the train/test sources, their iterators and
        the two fields.
    """
    text_field = data.Field(lower=True, tokenize='spacy', batch_first=True)
    label_field = data.LabelField(is_target=True)

    imdb_root = model_config.data_dir('imdb')
    train_source, test_source = IMDBCached.splits(
        root=imdb_root, text_field=text_field, label_field=label_field
    )

    # Vocabularies are built from the training source only
    text_field.build_vocab(train_source, max_size=25_000, vectors=vectors)
    label_field.build_vocab(train_source)

    sources = (train_source, test_source)
    train_iterator, test_iterator = data.BucketIterator.splits(
        sources,
        batch_size=batch_size,
        device=model_config.torch_device(),
        shuffle=True,
    )

    return TextData(
        train_source,
        test_source,
        train_iterator,
        test_iterator,
        text_field,
        label_field,
    )

Python torchtext.data.LabelField() Examples