Python torchtext.datasets() Examples
The following are 2 code examples of torchtext.datasets().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module torchtext, or try the search function.
Example #1
Source File: data_loader_txt.py From char-cnn-text-classification-pytorch with Apache License 2.0 | 5 votes |
def sst(text_field, label_field, batch_size, **kargs):
    """Load the SST dataset (fine-grained labels) and return bucketed iterators.

    Builds the vocabularies of *text_field* and *label_field* over all three
    splits, then constructs BucketIterators for train/dev/test.  The dev and
    test iterators each use a single batch spanning the entire split; the
    train iterator uses *batch_size*.  Extra keyword arguments are forwarded
    to ``data.BucketIterator.splits``.
    """
    splits = datasets.SST.splits(text_field, label_field, fine_grained=True)
    train_data, dev_data, test_data = splits
    # Vocabulary is built jointly over every split for both fields.
    for field in (text_field, label_field):
        field.build_vocab(train_data, dev_data, test_data)
    # BucketIterator.splits already returns (train_iter, dev_iter, test_iter).
    return data.BucketIterator.splits(
        (train_data, dev_data, test_data),
        batch_sizes=(batch_size, len(dev_data), len(test_data)),
        **kargs)
# load MR dataset
Example #2
Source File: data_loader_txt.py From char-cnn-text-classification-pytorch with Apache License 2.0 | 4 votes |
def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
    """Create an MR dataset instance given a path and fields.

    Arguments:
        text_field: The field that will be used for text data.
        label_field: The field that will be used for label data.
        path: Path to the data file.  Falls back to ``self.dirname``
            when ``None`` and *examples* is not supplied.
        examples: The examples contain all the data.  When ``None``,
            examples are read from ``rt-polarity.neg`` / ``rt-polarity.pos``
            under *path*.
        Remaining keyword arguments: Passed to the constructor of
            data.Dataset.
    """
    def clean_str(string):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Original taken from
        https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        # FIX: the original used replacement strings such as " \( " which
        # contain invalid escape sequences; in a plain string literal the
        # backslash is kept, so a literal "\(" token leaked into the cleaned
        # text (and SyntaxWarning is raised on modern Python).  Raw-string
        # patterns plus plain replacements produce the intended tokens.
        string = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", string)
        string = re.sub(r"'s", " 's", string)
        string = re.sub(r"'ve", " 've", string)
        string = re.sub(r"n't", " n't", string)
        string = re.sub(r"'re", " 're", string)
        string = re.sub(r"'d", " 'd", string)
        string = re.sub(r"'ll", " 'll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " ( ", string)
        string = re.sub(r"\)", " ) ", string)
        string = re.sub(r"\?", " ? ", string)
        # Collapse runs of whitespace introduced by the substitutions above.
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip()
    # Every raw line is cleaned through the pipeline before tokenization.
    text_field.preprocessing = data.Pipeline(clean_str)
    fields = [('text', text_field), ('label', label_field)]
    if examples is None:
        path = self.dirname if path is None else path
        examples = []
        # errors='ignore' skips the handful of malformed bytes in the
        # original MR distribution instead of aborting the load.
        with codecs.open(os.path.join(path, 'rt-polarity.neg'),
                         encoding='utf-8', errors='ignore') as f:
            examples += [
                data.Example.fromlist([line, 'negative'], fields)
                for line in f]
        with codecs.open(os.path.join(path, 'rt-polarity.pos'),
                         encoding='utf-8', errors='ignore') as f:
            examples += [
                data.Example.fromlist([line, 'positive'], fields)
                for line in f]
    super(MR, self).__init__(examples, fields, **kwargs)