Python torchtext.data.BucketIterator() Examples
The following are 30 code examples of torchtext.data.BucketIterator().
You can follow the links above each example to go to the original project or source file. You may also want to check out the other functions and classes available in the torchtext.data module.
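For orientation, here is a minimal sketch of the pattern most of the examples below follow: define Fields, build a dataset, then wrap it in a BucketIterator. The file name, column names, and sizes are placeholders, not taken from any example on this page; the snippets use the legacy torchtext API (in torchtext 0.9+ these classes live under torchtext.legacy.data).

import torch
from torchtext import data  # legacy torchtext API

# All names below (file, columns, batch size) are illustrative placeholders.
TEXT = data.Field(tokenize=lambda s: s.split(), lower=True, batch_first=True)
LABEL = data.Field(sequential=False, unk_token=None)

train = data.TabularDataset("train.csv", format="csv", skip_header=True,
                            fields=[("text", TEXT), ("label", LABEL)])
TEXT.build_vocab(train)
LABEL.build_vocab(train)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iter = data.BucketIterator(
    train,
    batch_size=32,
    sort_key=lambda x: len(x.text),  # bucket examples of similar length to minimize padding
    sort_within_batch=True,          # sort each minibatch by length (useful for packed RNN input)
    shuffle=True,
    device=device)

for batch in train_iter:
    text, label = batch.text, batch.label  # text: (batch, seq_len) because batch_first=True
    # ... forward / backward pass ...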
Example #1
Source File: semantic_similar_data.py From glyce with Apache License 2.0 | 6 votes |
def __init__(self, args):
    self.RAW = data.RawField()
    self.RAW.is_target = False
    tokenize = lambda x: list(x)
    self.TEXT = data.Field(batch_first=True, tokenize=tokenize)
    self.LABEL = data.Field(sequential=False, unk_token=None)
    self.train, self.dev, self.test = data.TabularDataset.splits(
        path='/data/nfsdata/nlp/datasets/sentence_pair/bq_corpus_torch10',
        train='BQ_train.json',
        validation='BQ_dev.json',
        test='BQ_test.json',
        format='json',
        fields={"gold_label": ("label", self.LABEL),
                "sentence1": ("q1", self.TEXT),
                "sentence2": ("q2", self.TEXT),
                "ID": ("id", self.RAW)})

    self.TEXT.build_vocab(self.train, self.dev, self.test, vectors=Vectors("BQ300", args.data))
    self.LABEL.build_vocab(self.train)

    sort_key = lambda x: data.interleave_keys(len(x.q1), len(x.q2))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.train_iter = data.BucketIterator(self.train, batch_size=args.batch_size, device=device,
                                          sort_key=sort_key, sort=True)
    self.dev_iter = data.BucketIterator(self.dev, batch_size=args.batch_size, device=device,
                                        sort_key=sort_key, sort=True)
    self.test_iter = data.BucketIterator(self.test, batch_size=args.batch_size, device=device,
                                         sort_key=sort_key, sort=True)
Example #2
Source File: datasets.py From TorchFusion with MIT License | 6 votes |
def json_data_loader(file_path, fields, split_ratio=None, split_seed=None, skip_header=False,
                     save_vocab_path=os.getcwd(), batch_size=32, device=None, train=True, **args):
    """
    :param file_path:
    :param fields:
    :param split_ratio:
    :param split_seed:
    :param skip_header:
    :param save_vocab_path:
    :param batch_size:
    :param device:
    :param train:
    :param args:
    :return:
    """
    dataset = load_tabular_set(file_path, "json", fields=fields, split_ratio=split_ratio,
                               split_seed=split_seed, skip_header=skip_header,
                               save_vocab_path=save_vocab_path, **args)

    return BucketIterator(dataset, batch_size=batch_size, device=device, train=True,
                          shuffle=train, repeat=False)
Example #3
Source File: datasets.py From TorchFusion with MIT License | 6 votes |
def tsv_data_split_loader(root_path, fields, train=None, val=None, test=None, skip_header=False,
                          save_vocab_path=os.getcwd(), batch_size=32, device=None, **args):
    """
    :param root_path:
    :param fields:
    :param train:
    :param val:
    :param test:
    :param skip_header:
    :param save_vocab_path:
    :param batch_size:
    :param device:
    :param args:
    :return:
    """
    dataset = load_tabular_set_split(root_path, "tsv", fields=fields, train=train, val=val, test=test,
                                     skip_header=skip_header, save_vocab_path=save_vocab_path, **args)

    return BucketIterator(dataset, batch_size=batch_size, device=device, train=True,
                          shuffle=train, repeat=False)
Example #4
Source File: datasets.py From TorchFusion with MIT License | 6 votes |
def tsv_data_loader(file_path, fields, split_ratio=None, split_seed=None, skip_header=False,
                    save_vocab_path=os.getcwd(), batch_size=32, device=None, train=True, **args):
    """
    :param file_path:
    :param fields:
    :param split_ratio:
    :param split_seed:
    :param skip_header:
    :param save_vocab_path:
    :param batch_size:
    :param device:
    :param train:
    :param args:
    :return:
    """
    dataset = load_tabular_set(file_path, "tsv", fields=fields, split_ratio=split_ratio,
                               split_seed=split_seed, skip_header=skip_header,
                               save_vocab_path=save_vocab_path, **args)

    return BucketIterator(dataset, batch_size=batch_size, device=device, train=True,
                          shuffle=train, repeat=False)
Example #5
Source File: datasets.py From TorchFusion with MIT License | 6 votes |
def csv_data_split_loader(root_path, fields, train=None, val=None, test=None, skip_header=False,
                          save_vocab_path=os.getcwd(), batch_size=32, device=None, **args):
    """
    :param root_path:
    :param fields:
    :param train:
    :param val:
    :param test:
    :param skip_header:
    :param save_vocab_path:
    :param batch_size:
    :param device:
    :param args:
    :return:
    """
    dataset = load_tabular_set_split(root_path, "csv", fields=fields, train=train, val=val, test=test,
                                     skip_header=skip_header, save_vocab_path=save_vocab_path, **args)

    return BucketIterator(dataset, batch_size=batch_size, device=device, train=True,
                          shuffle=train, repeat=False)
Example #6
Source File: iterators.py From OpenKiwi with GNU Affero General Public License v3.0 | 6 votes |
def build_bucket_iterator(dataset, device, batch_size, is_train):
    device_obj = None if device is None else torch.device(device)
    iterator = data.BucketIterator(
        dataset=dataset,
        batch_size=batch_size,
        repeat=False,
        sort_key=dataset.sort_key,
        sort=False,
        # sorts the data within each minibatch in decreasing order
        # set to true if you want use pack_padded_sequences
        sort_within_batch=is_train,
        # shuffle batches
        shuffle=is_train,
        device=device_obj,
        train=is_train,
    )
    return iterator
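The sort_within_batch flag in the example above matters when batches are later packed for an RNN: torch.nn.utils.rnn.pack_padded_sequence expects lengths in decreasing order (unless enforce_sorted=False). A minimal sketch of that pairing, with tensors and sizes invented purely for illustration (not OpenKiwi code):

import torch
from torch.nn.utils.rnn import pack_padded_sequence

# Hypothetical padded batch; with a Field built with include_lengths=True this
# would come from the iterator as: src, lengths = batch.src
src = torch.randint(1, 100, (4, 12))     # (batch, max_len) padded token indices
lengths = torch.tensor([12, 10, 7, 5])   # decreasing, as sort_within_batch=True guarantees

embedded = torch.nn.Embedding(100, 16)(src)               # (batch, max_len, emb_dim)
packed = pack_padded_sequence(embedded, lengths, batch_first=True)
output, hidden = torch.nn.GRU(16, 32, batch_first=True)(packed)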
Example #7
Source File: datasets.py From TorchFusion with MIT License | 6 votes |
def csv_data_loader(file_path, fields, split_ratio=None, split_seed=None, skip_header=False,
                    save_vocab_path=os.getcwd(), batch_size=32, device=None, train=True, **args):
    """
    :param file_path:
    :param fields:
    :param split_ratio:
    :param split_seed:
    :param skip_header:
    :param save_vocab_path:
    :param batch_size:
    :param device:
    :param train:
    :param args:
    :return:
    """
    dataset = load_tabular_set(file_path, "csv", fields=fields, split_ratio=split_ratio,
                               split_seed=split_seed, skip_header=skip_header,
                               save_vocab_path=save_vocab_path, **args)

    return BucketIterator(dataset, batch_size=batch_size, device=device, train=True,
                          shuffle=train, repeat=False)
Example #8
Source File: train.py From attention-is-all-you-need-pytorch with MIT License | 6 votes |
def prepare_dataloaders(opt, device):
    batch_size = opt.batch_size
    data = pickle.load(open(opt.data_pkl, 'rb'))

    opt.max_token_seq_len = data['settings'].max_len
    opt.src_pad_idx = data['vocab']['src'].vocab.stoi[Constants.PAD_WORD]
    opt.trg_pad_idx = data['vocab']['trg'].vocab.stoi[Constants.PAD_WORD]

    opt.src_vocab_size = len(data['vocab']['src'].vocab)
    opt.trg_vocab_size = len(data['vocab']['trg'].vocab)

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert data['vocab']['src'].vocab.stoi == data['vocab']['trg'].vocab.stoi, \
            'To sharing word embedding the src/trg word2idx table shall be the same.'

    fields = {'src': data['vocab']['src'], 'trg': data['vocab']['trg']}

    train = Dataset(examples=data['train'], fields=fields)
    val = Dataset(examples=data['valid'], fields=fields)

    train_iterator = BucketIterator(train, batch_size=batch_size, device=device, train=True)
    val_iterator = BucketIterator(val, batch_size=batch_size, device=device)

    return train_iterator, val_iterator
Example #9
Source File: preprocessing_funcs.py From NLP_Toolkit with Apache License 2.0 | 6 votes |
def load_dataloaders(args):
    logger.info("Preparing dataloaders...")
    FR = torchtext.data.Field(tokenize=dum_tokenizer, lower=True, init_token="<sos>",
                              eos_token="<eos>", batch_first=True)
    EN = torchtext.data.Field(tokenize=dum_tokenizer, lower=True, batch_first=True)
    train_path = os.path.join("./data/", "df.csv")
    if not os.path.isfile(train_path):
        tokenize_data(args)
    train = torchtext.data.TabularDataset(train_path, format="csv",
                                          fields=[("EN", EN), ("FR", FR)])
    FR.build_vocab(train)
    EN.build_vocab(train)
    train_iter = BucketIterator(train, batch_size=args.batch_size, repeat=False,
                                sort_key=lambda x: (len(x["EN"]), len(x["FR"])),
                                shuffle=True, train=True)
    train_length = len(train)
    logger.info("Loaded dataloaders.")
    return train_iter, FR, EN, train_length
Example #10
Source File: train.py From aivivn-tone with MIT License | 5 votes |
def train_in_parts(self, train_parts, val, val_iterator, batch_size, start_epoch=0, print_every=100):
    for epoch in range(start_epoch, self.n_epochs):
        # shuffle data each epoch
        random.shuffle(train_parts)

        for train_src_, train_tgt_ in train_parts:
            # create train dataset
            print("Training part [{}] with target [{}]...".format(train_src_, train_tgt_))
            train_ = Seq2SeqDataset.from_file(train_src_, train_tgt_, share_fields_from=val)

            # create iterator
            train_iterator_ = BucketIterator(dataset=train_, batch_size=batch_size,
                                             sort=False, sort_within_batch=True,
                                             sort_key=lambda x: len(x.src),
                                             shuffle=True, device=device)

            # train
            self._train_epoch(epoch, train_iterator_, train=True, print_every=print_every)

            # clean
            del train_
            del train_iterator_
            gc.collect()

        # save
        self.save(epoch)

        # evaluate on validation set after each epoch
        with torch.no_grad():
            self._train_epoch(epoch, val_iterator, train=False, print_every=print_every)
Example #11
Source File: data.py From joeynmt with Apache License 2.0 | 5 votes |
def make_data_iter(dataset: Dataset,
                   batch_size: int,
                   batch_type: str = "sentence",
                   train: bool = False,
                   shuffle: bool = False) -> Iterator:
    """
    Returns a torchtext iterator for a torchtext dataset.

    :param dataset: torchtext dataset containing src and optionally trg
    :param batch_size: size of the batches the iterator prepares
    :param batch_type: measure batch size by sentence count or by token count
    :param train: whether it's training time, when turned off,
        bucketing, sorting within batches and shuffling is disabled
    :param shuffle: whether to shuffle the data before each epoch
        (no effect if set to True for testing)
    :return: torchtext iterator
    """
    batch_size_fn = token_batch_size_fn if batch_type == "token" else None

    if train:
        # optionally shuffle and sort during training
        data_iter = data.BucketIterator(
            repeat=False, sort=False, dataset=dataset,
            batch_size=batch_size, batch_size_fn=batch_size_fn,
            train=True, sort_within_batch=True,
            sort_key=lambda x: len(x.src), shuffle=shuffle)
    else:
        # don't sort/shuffle for validation/inference
        data_iter = data.BucketIterator(
            repeat=False, dataset=dataset,
            batch_size=batch_size, batch_size_fn=batch_size_fn,
            train=False, sort=False)

    return data_iter
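The token_batch_size_fn referenced above is not shown in this snippet. In torchtext, a batch_size_fn receives the newest example, the number of examples accumulated so far, and the effective size computed so far, and returns the updated effective size. A rough token-counting sketch, assuming each example has a src attribute as above (an illustration, not the actual joeynmt implementation):

def token_batch_size_fn(new, count, sofar):
    # Measure effective batch size in source tokens rather than in sentences,
    # so batch_size means "tokens per batch" when batch_type == "token".
    return sofar + len(new.src)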
Example #12
Source File: iterator.py From deepmatcher with BSD 3-Clause "New" or "Revised" License | 5 votes |
def create_batches(self):
    if self.sort_in_buckets:
        return data.BucketIterator.create_batches(self)
    else:
        return data.Iterator.create_batches(self)
Example #13
Source File: train.py From attention-is-all-you-need-pytorch with MIT License | 5 votes |
def prepare_dataloaders_from_bpe_files(opt, device):
    batch_size = opt.batch_size
    MIN_FREQ = 2
    if not opt.embs_share_weight:
        raise

    data = pickle.load(open(opt.data_pkl, 'rb'))
    MAX_LEN = data['settings'].max_len
    field = data['vocab']
    fields = (field, field)

    def filter_examples_with_length(x):
        return len(vars(x)['src']) <= MAX_LEN and len(vars(x)['trg']) <= MAX_LEN

    train = TranslationDataset(
        fields=fields,
        path=opt.train_path,
        exts=('.src', '.trg'),
        filter_pred=filter_examples_with_length)
    val = TranslationDataset(
        fields=fields,
        path=opt.val_path,
        exts=('.src', '.trg'),
        filter_pred=filter_examples_with_length)

    opt.max_token_seq_len = MAX_LEN + 2
    opt.src_pad_idx = opt.trg_pad_idx = field.vocab.stoi[Constants.PAD_WORD]
    opt.src_vocab_size = opt.trg_vocab_size = len(field.vocab)

    train_iterator = BucketIterator(train, batch_size=batch_size, device=device, train=True)
    val_iterator = BucketIterator(val, batch_size=batch_size, device=device)

    return train_iterator, val_iterator
Example #14
Source File: train.py From aivivn-tone with MIT License | 5 votes |
def load_data(train_src, train_tgt, val_src, val_tgt, batch_size=64, save_path="checkpoint"):
    # prepare dataset
    print("Reading data...")
    train = Seq2SeqDataset.from_file(train_src, train_tgt)
    print("Building vocab...")
    train.build_vocab(max_size=300)
    val = Seq2SeqDataset.from_file(val_src, val_tgt, share_fields_from=train)
    src_vocab = train.src_field.vocab
    tgt_vocab = train.tgt_field.vocab

    # save vocab
    with open(os.path.join(save_path, "vocab.src"), "wb") as f:
        dill.dump(src_vocab, f)
    with open(os.path.join(save_path, "vocab.tgt"), "wb") as f:
        dill.dump(tgt_vocab, f)

    print("Source vocab size:", len(src_vocab))
    print("Target vocab size:", len(tgt_vocab))

    # data iterator
    # keep sort=False and shuffle=False to speed up training and reduce memory usage
    train_iterator = BucketIterator(dataset=train, batch_size=batch_size,
                                    sort=False, sort_within_batch=True,
                                    sort_key=lambda x: len(x.src),
                                    shuffle=False, device=device)
    val_iterator = BucketIterator(dataset=val, batch_size=batch_size, train=False,
                                  sort=False, sort_within_batch=True,
                                  sort_key=lambda x: len(x.src),
                                  shuffle=False, device=device)

    return src_vocab, tgt_vocab, train_iterator, val_iterator
Example #15
Source File: tool.py From lightNLP with Apache License 2.0 | 5 votes |
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                 sort_key=lambda x: len(x.source), sort_within_batch=True):
    return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key,
                          sort_within_batch=sort_within_batch)
Example #16
Source File: tool.py From lightNLP with Apache License 2.0 | 5 votes |
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                 sort_key=lambda x: len(x.query), sort_within_batch=True):
    return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key,
                          sort_within_batch=sort_within_batch)
Example #17
Source File: tool.py From lightNLP with Apache License 2.0 | 5 votes |
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                 sort_key=lambda x: len(x.text), sort_within_batch=True):
    return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key,
                          sort_within_batch=sort_within_batch)
Example #18
Source File: tool.py From lightNLP with Apache License 2.0 | 5 votes |
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                 sort_key=lambda x: len(x.word), sort_within_batch=True):
    return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key,
                          sort_within_batch=sort_within_batch)
Example #19
Source File: tool.py From lightNLP with Apache License 2.0 | 5 votes |
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE):
    return BucketIterator(dataset, batch_size=batch_size, device=device)
Example #20
Source File: tool.py From lightNLP with Apache License 2.0 | 5 votes |
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                 sort_key=lambda x: len(x.text), sort_within_batch=True):
    return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key,
                          sort_within_batch=sort_within_batch)
Example #21
Source File: tool.py From lightNLP with Apache License 2.0 | 5 votes |
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                 sort_key=lambda x: len(x.text), sort_within_batch=True):
    return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key,
                          sort_within_batch=sort_within_batch)
Example #22
Source File: tool.py From lightNLP with Apache License 2.0 | 5 votes |
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                 sort_key=lambda x: len(x.text), sort_within_batch=True):
    return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key,
                          sort_within_batch=sort_within_batch)
Example #23
Source File: tool.py From lightNLP with Apache License 2.0 | 5 votes |
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                 sort_key=lambda x: len(x.texta)):
    return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key)
Example #24
Source File: tool.py From lightNLP with Apache License 2.0 | 5 votes |
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                 sort_key=lambda x: len(x.texta)):
    return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key)
Example #25
Source File: train.py From aivivn-tone with MIT License | 5 votes |
def load_data_in_parts(train_src, train_tgt, val_src, val_tgt, batch_size=64, save_path="checkpoint"):
    # prepare dataset
    print("Reading data...")
    val = Seq2SeqDataset.from_file(val_src, val_tgt)
    print("Building vocab...")
    val.build_vocab(max_size=300)
    src_vocab = val.src_field.vocab
    tgt_vocab = val.tgt_field.vocab

    # save vocab
    with open(os.path.join(save_path, "vocab.src"), "wb") as f:
        dill.dump(src_vocab, f)
    with open(os.path.join(save_path, "vocab.tgt"), "wb") as f:
        dill.dump(tgt_vocab, f)

    print("Source vocab size:", len(src_vocab))
    print("Target vocab size:", len(tgt_vocab))

    # data iterator
    # keep sort=False and shuffle=False to speed up training and reduce memory usage
    val_iterator = BucketIterator(dataset=val, batch_size=batch_size, train=False,
                                  sort=False, sort_within_batch=True,
                                  sort_key=lambda x: len(x.src),
                                  shuffle=False, device=device)

    return src_vocab, tgt_vocab, list(zip(train_src, train_tgt)), val, val_iterator, batch_size
Example #26
Source File: tool.py From lightKG with Apache License 2.0 | 5 votes |
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                 sort_key=lambda x: len(x.text), sort_within_batch=True):
    return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key,
                          sort_within_batch=sort_within_batch)
Example #27
Source File: tool.py From lightKG with Apache License 2.0 | 5 votes |
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                 sort_key=lambda x: len(x.text), sort_within_batch=True):
    return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key,
                          sort_within_batch=sort_within_batch)
Example #28
Source File: torchtext_data_loaders.py From quick-nlp with MIT License | 5 votes |
def __init__(self, dataset: Dataset, batch_size: int, source_names: List[str], target_names: List[str],
             sort_key: Optional[Callable] = None, **kwargs):
    self.dataset = dataset
    self.source_names = source_names
    self.target_names = target_names

    # sort by the first field if no sort key is given
    if sort_key is None:
        def sort_key(x):
            return getattr(x, self.source_names[0])

    device = None if cuda.is_available() else -1
    self.dl = BucketIterator(dataset, batch_size=batch_size, sort_key=sort_key, device=device, **kwargs)
    self.bs = batch_size
    self.iter = 0
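Note the device value in this example: older torchtext releases expected an integer device ordinal, with -1 meaning CPU, which is why -1 is passed here instead of a torch.device. Later versions of the legacy API accept torch.device objects, as most of the other examples on this page do.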
Example #29
Source File: train_E2E.py From conv-emotion with MIT License | 4 votes |
def get_E2E_loaders(path, valid=0.1, batch_size=32):
    utterance = data.Field(tokenize=tokenizer, lower=True)
    label = data.Field(sequential=False, postprocessing=Pipeline(convert_token=convert_token))
    id = data.Field(use_vocab=False, sequential=False)
    fields = [('id', id),
              ('turn1', utterance),
              ('turn2', utterance),
              ('turn3', utterance),
              ('label', label)]

    train = data.TabularDataset('{}/train.txt'.format(path),
                                format='tsv',
                                fields=fields,
                                skip_header=True)
    valid = data.TabularDataset('{}/valid.txt'.format(path),
                                format='tsv',
                                fields=fields,
                                skip_header=True)
    test = data.TabularDataset('{}/test.txt'.format(path),
                               format='tsv',
                               fields=fields,
                               skip_header=True)

    vectors = vocab.Vectors(name='emojiplusglove.txt', cache='/media/backup/nlp-cic/DialogueRNN/')
    utterance.build_vocab(train, valid, test, vectors=vectors)
    #utterance.build_vocab(train, valid, test, vectors='glove.840B.300d')
    label.build_vocab(train)

    train_iter = BucketIterator(train,
                                train=True,
                                batch_size=batch_size,
                                sort_key=lambda x: len(x.turn3),
                                device=torch.device(0))
    valid_iter = BucketIterator(valid,
                                batch_size=batch_size,
                                sort_key=lambda x: len(x.turn3),
                                device=torch.device(0))
    test_iter = BucketIterator(test,
                               batch_size=batch_size,
                               sort_key=lambda x: len(x.turn3),
                               device=torch.device(0))

    return train_iter, valid_iter, test_iter,\
           utterance.vocab.vectors if not args.cuda else utterance.vocab.vectors.cuda(),\
           label.vocab.itos
Example #30
Source File: predict.py From aivivn-tone with MIT License | 4 votes |
def predict(self, test_path, test_cleaned_path, out_path):
    # read raw data to list
    lines_id = []
    lines_raw = []
    lines_cleaned = []
    lines_prep = []
    with open(test_path, 'r') as f, open(test_cleaned_path, 'r') as fc:
        for line in f:
            line_id = line[:3]
            line_seq = line[4:]
            lines_id.append(line_id)
            lines_raw.append(line_seq)
            lines_prep.append(self.preprocess(line_seq))
        for line in fc:
            lines_cleaned.append(line[4:])

    # prepare dataset
    print("Reading test data...")
    test = Seq2SeqDataset.from_list(lines_prep)
    test.src_field.vocab = self.src_vocab

    # prepare iterator
    test_iterator = BucketIterator(dataset=test, batch_size=1, train=False,
                                   sort=False, sort_within_batch=False,
                                   shuffle=False, device=device)

    # predict
    with open(out_path, 'w') as writer:
        with torch.no_grad():
            for i, batch in enumerate(test_iterator):
                # forward through model
                _, _, output = self.model(batch, has_targets=False, mask_softmax=1.0, teacher_forcing=1.0)

                # get top-1
                predicted_values, predicted_indices = torch.max(output, dim=-1)

                # convert predicted vocab indices to an actual sentence
                predicted_seq = [self.tgt_vocab.itos[c] for c in predicted_indices.squeeze(0).tolist()]

                # output is log_softmax so do exp()
                predicted_values = predicted_values.exp()
                # convert to list
                predicted_values_ = predicted_values.squeeze(0).tolist()

                # beam search
                predicted_seq = self.beam_lm(''.join(predicted_seq[1:-1]), predicted_values_[1:-1], lines_raw[i])

                # match case and punctuations
                predicted_seq = self.match_case(predicted_seq, lines_raw[i])

                # do some post-processing to match submission output
                predicted_seq = self.match_output(predicted_seq, lines_cleaned[i])
                print("{} {}".format(i, predicted_seq))

                # write to file with line_id
                writer.write(lines_id[i] + ',' + predicted_seq + '\n')