Python torch.utils.data.SequentialSampler() Examples
The following are 30 code examples of torch.utils.data.SequentialSampler().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module torch.utils.data, or try the search function.
Example #1
Source File: model.py From MAX-Toxic-Comment-Classifier with Apache License 2.0 | 6 votes |
def _pre_process(self, input):
    """Turn raw input texts into a sequentially sampled DataLoader.

    Side effects: sets ``self.start_time`` to the current time and stores the
    resulting loader on ``self.test_dataloader``.

    Args:
        input: Iterable of texts; each becomes ``text_a`` of an ``InputExample``
            whose ``guid`` is its position in the iterable.

    Returns:
        The list of ``InputExample`` objects built from ``input``.
    """
    # Record when the prediction work started.
    self.start_time = time.time()

    # Convert the input to features.
    examples = []
    for position, text in enumerate(input):
        examples.append(InputExample(guid=position, text_a=text, labels=[]))
    features = convert_examples_to_features(examples, self.max_seq_length, self.tokenizer)

    def as_long_tensor(field):
        return torch.tensor([getattr(feat, field) for feat in features], dtype=torch.long)

    # Batch the examples in their original order.
    dataset = TensorDataset(
        as_long_tensor("input_ids"),
        as_long_tensor("input_mask"),
        as_long_tensor("segment_ids"),
    )
    self.test_dataloader = DataLoader(
        dataset,
        sampler=SequentialSampler(dataset),
        batch_size=self.eval_batch_size,
    )
    return examples
Example #2
Source File: njuner.py From bert-ner with MIT License | 6 votes |
def _predict_features(self, features, tokens): all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) predict_data = TensorDataset(all_input_ids, all_input_mask) predict_sampler = SequentialSampler(predict_data) predict_dataloader = DataLoader(predict_data, sampler=predict_sampler, batch_size=self._batch_size) self._model.eval() predict_ids = [] for batch in predict_dataloader: batch = tuple(t.to(self._device) for t in batch) input_ids, input_mask = batch logits = self._model(input_ids, input_mask) logits = logits.detach().cpu().numpy() predict_ids.extend(np.argmax(logits, -1).tolist()) predictions = [] for token_line, predict_line in zip(tokens, predict_ids): predictions.append([self._label_list[label_id] for label_id in predict_line[1: 1+len(token_line)]]) return predictions
Example #3
Source File: data_cls.py From fast-bert with Apache License 2.0 | 6 votes |
def get_dl_from_texts(self, texts):
    """Build a sequentially sampled DataLoader over raw texts.

    Args:
        texts: Iterable of texts; each is wrapped in an ``InputExample`` whose
            first argument is its position and whose ``label`` is ``None``.

    Returns:
        A ``DataLoader`` over the dataset produced by
        ``self.get_dataset_from_examples`` with batch size
        ``self.batch_size_per_gpu``.
    """
    test_examples = [
        InputExample(index, text, label=None) for index, text in enumerate(texts)
    ]
    test_dataset = self.get_dataset_from_examples(
        test_examples, "test", is_test=True, no_cache=True
    )
    return DataLoader(
        test_dataset,
        sampler=SequentialSampler(test_dataset),
        batch_size=self.batch_size_per_gpu,
    )
Example #4
Source File: run_extract_span.py From SpanABSA with Apache License 2.0 | 6 votes |
def read_eval_data(args, tokenizer, logger):
    """Load the prediction file and wrap its features in a DataLoader.

    Args:
        args: Namespace providing ``data_dir``, ``predict_file``,
            ``verbose_logging``, ``max_seq_length``, ``predict_batch_size``
            and ``local_rank``.
        tokenizer: Passed through to ``convert_examples_to_features``.
        logger: Logger used for the size summary.

    Returns:
        tuple: ``(eval_examples, eval_features, eval_dataloader)``.
    """
    examples = convert_absa_data(
        dataset=read_absa_data(os.path.join(args.data_dir, args.predict_file)),
        verbose_logging=args.verbose_logging,
    )
    features = convert_examples_to_features(
        examples, tokenizer, args.max_seq_length, args.verbose_logging, logger
    )

    logger.info("Num orig examples = %d", len(examples))
    logger.info("Num split features = %d", len(features))
    logger.info("Batch size = %d", args.predict_batch_size)

    def long_column(attr):
        return torch.tensor([getattr(feat, attr) for feat in features], dtype=torch.long)

    input_ids = long_column("input_ids")
    input_mask = long_column("input_mask")
    segment_ids = long_column("segment_ids")
    # Row number of each feature, carried along with every batch.
    example_index = torch.arange(input_ids.size(0), dtype=torch.long)
    dataset = TensorDataset(input_ids, input_mask, segment_ids, example_index)

    # local_rank == -1 selects the plain in-order sampler; any other value
    # selects the distributed one.
    sampler_cls = SequentialSampler if args.local_rank == -1 else DistributedSampler
    loader = DataLoader(
        dataset, sampler=sampler_cls(dataset), batch_size=args.predict_batch_size
    )
    return examples, features, loader
Example #5
Source File: run_joint_span.py From SpanABSA with Apache License 2.0 | 6 votes |
def read_eval_data(args, tokenizer, logger):
    """Read the evaluation set and return it together with a DataLoader.

    Args:
        args: Namespace providing ``data_dir``, ``predict_file``,
            ``verbose_logging``, ``max_seq_length``, ``predict_batch_size``
            and ``local_rank``.
        tokenizer: Forwarded to ``convert_examples_to_features``.
        logger: Receives the example/feature/batch-size summary.

    Returns:
        tuple: ``(eval_examples, eval_features, eval_dataloader)``.
    """
    source_path = os.path.join(args.data_dir, args.predict_file)
    raw_set = read_absa_data(source_path)
    eval_examples = convert_absa_data(dataset=raw_set, verbose_logging=args.verbose_logging)
    eval_features = convert_examples_to_features(
        eval_examples, tokenizer, args.max_seq_length, args.verbose_logging, logger
    )

    for message, value in (
        ("Num orig examples = %d", len(eval_examples)),
        ("Num split features = %d", len(eval_features)),
        ("Batch size = %d", args.predict_batch_size),
    ):
        logger.info(message, value)

    tensors = [
        torch.tensor([getattr(feat, field) for feat in eval_features], dtype=torch.long)
        for field in ("input_ids", "input_mask", "segment_ids")
    ]
    # Last column is the row number of each feature.
    tensors.append(torch.arange(tensors[0].size(0), dtype=torch.long))
    eval_data = TensorDataset(*tensors)

    if args.local_rank != -1:
        eval_sampler = DistributedSampler(eval_data)
    else:
        eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(
        eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size
    )
    return eval_examples, eval_features, eval_dataloader
Example #6
Source File: data_loading.py From pytorch-lightning with Apache License 2.0 | 6 votes |
def auto_add_sampler(self, dataloader: DataLoader, train: bool) -> DataLoader:
    """Swap the dataloader's sampler for a distributed one when required.

    The input is returned untouched when it is not a ``DataLoader``, when it
    wraps an iterable dataset, or when sampler replacement is not needed
    (``replace_sampler_ddp`` is falsy or no distributed backend flag is set).

    Args:
        dataloader: Candidate dataloader.
        train: Not read by this method.

    Returns:
        The original dataloader, or the result of ``self.replace_sampler``.

    Raises:
        MisconfigurationException: If replacement is needed but the existing
            sampler is neither a ``SequentialSampler`` nor a ``RandomSampler``.
    """
    is_dataloader = isinstance(dataloader, DataLoader)
    is_iterable_ds = _has_iterable_dataset(dataloader)
    # Leave non-dataloaders and iterable datasets alone.
    if is_iterable_ds or not is_dataloader:
        return dataloader

    need_dist_sampler = (
        self.use_ddp or self.use_ddp2 or self.use_horovod or self.use_tpu
    )
    if not self.replace_sampler_ddp or not need_dist_sampler:
        return dataloader

    if not isinstance(dataloader.sampler, (SequentialSampler, RandomSampler)):
        raise MisconfigurationException(
            'You seem to have configured a sampler in your DataLoader. This will be replaced '
            ' by `DistributedSampler` since `replace_sampler_ddp` is True and you are using'
            ' distributed training. Either remove the sampler from your DataLoader or set'
            ' `replace_sampler_ddp`=False if you want to use your custom sampler.')

    # Replace with the distributed sampler.
    distributed_sampler = self._get_distributed_sampler(dataloader)
    return self.replace_sampler(dataloader, distributed_sampler)
Example #7
Source File: base_task.py From Doc2EDAG with MIT License | 6 votes |
def prepare_data_loader(self, dataset, batch_size, rand_flag=True):
    """Create a DataLoader over ``dataset``.

    Args:
        dataset: Dataset to iterate.
        batch_size: Number of items per batch.
        rand_flag: When truthy a ``RandomSampler`` is used, otherwise a
            ``SequentialSampler``.

    Returns:
        A ``DataLoader``; ``self.custom_collate_fn`` is passed as
        ``collate_fn`` only when it is not ``None``.
    """
    sampler_cls = RandomSampler if rand_flag else SequentialSampler
    loader_kwargs = {"batch_size": batch_size, "sampler": sampler_cls(dataset)}
    if self.custom_collate_fn is not None:
        loader_kwargs["collate_fn"] = self.custom_collate_fn
    return DataLoader(dataset, **loader_kwargs)
Example #8
Source File: bert.py From BERT-SQuAD with GNU Affero General Public License v3.0 | 5 votes |
def predict(self, passage: str, question: str):
    """Answer ``question`` from ``passage``.

    Args:
        passage: Context text.
        question: Question text.

    Returns:
        Whatever ``get_answer`` returns for the collected raw results.
    """
    example = input_to_squad_example(passage, question)
    features = squad_examples_to_features(
        example, self.tokenizer, self.max_seq_length, self.doc_stride, self.max_query_length
    )

    def stack(attr):
        return torch.tensor([getattr(feat, attr) for feat in features], dtype=torch.long)

    input_ids = stack("input_ids")
    input_mask = stack("input_mask")
    segment_ids = stack("segment_ids")
    # Row number of each feature, used to look the feature up again below.
    feature_rows = torch.arange(input_ids.size(0), dtype=torch.long)
    dataset = TensorDataset(input_ids, input_mask, segment_ids, feature_rows)
    loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=1)

    all_results = []
    for batch in loader:
        batch = tuple(t.to(self.device) for t in batch)
        with torch.no_grad():
            outputs = self.model(
                input_ids=batch[0],
                attention_mask=batch[1],
                token_type_ids=batch[2],
            )
        for row, example_index in enumerate(batch[3]):
            feature = features[example_index.item()]
            all_results.append(
                RawResult(
                    unique_id=int(feature.unique_id),
                    start_logits=to_list(outputs[0][row]),
                    end_logits=to_list(outputs[1][row]),
                )
            )

    return get_answer(
        example, features, all_results,
        self.n_best_size, self.max_answer_length, self.do_lower_case,
    )
Example #9
Source File: run_albert.py From Bert-Multi-Label-Text-Classification with MIT License | 5 votes |
def run_test(args):
    """Run the ALBERT multi-label model over the configured test set and print the result.

    Reads paths from the module-level ``config`` mapping and model/run options
    from ``args`` (``do_lower_case``, ``arch``, ``eval_max_seq_len``,
    ``train_batch_size``, ``n_gpu``).

    Args:
        args: Parsed command-line namespace.
    """
    from pybert.io.task_data import TaskData
    from pybert.test.predictor import Predictor

    data = TaskData()
    targets, sentences = data.read_data(raw_data_path=config['test_path'],
                                        preprocessor=EnglishPreProcessor(),
                                        is_train=False)
    lines = list(zip(sentences, targets))

    processor = AlbertProcessor(spm_model_file=config['albert_vocab_path'],
                                do_lower_case=args.do_lower_case,
                                vocab_file=None)
    label_list = processor.get_labels()

    test_data = processor.get_test(lines=lines)
    test_examples = processor.create_examples(
        lines=test_data,
        example_type='test',
        cached_examples_file=config['data_dir'] / f"cached_test_examples_{args.arch}",
    )
    test_features = processor.create_features(
        examples=test_examples,
        max_seq_len=args.eval_max_seq_len,
        cached_features_file=config['data_dir']
        / f"cached_test_features_{args.eval_max_seq_len}_{args.arch}",
    )
    test_dataset = processor.create_dataset(test_features)
    test_sampler = SequentialSampler(test_dataset)
    # NOTE(review): the test loader is sized with args.train_batch_size —
    # confirm that a separate evaluation batch size is not intended.
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler,
                                 batch_size=args.train_batch_size,
                                 collate_fn=collate_fn)
    model = AlbertForMultiLable.from_pretrained(config['checkpoint_dir'],
                                                num_labels=len(label_list))

    # ----------- predicting
    logger.info('model predicting....')
    predictor = Predictor(model=model, logger=logger, n_gpu=args.n_gpu)
    result = predictor.predict(data=test_dataloader)
    print(result)
Example #10
Source File: dataloader.py From nonechucks with MIT License | 5 votes |
def _replace_default_samplers(cls): cls.sequential = data.dataloader.SequentialSampler cls.random = data.dataloader.RandomSampler def safe_sampler_callable(sampler_cls, dataset): return SafeSampler(dataset, sampler_cls(dataset)) data.dataloader.SequentialSampler = partial( safe_sampler_callable, data.SequentialSampler ) data.dataloader.RandomSampler = partial( safe_sampler_callable, data.RandomSampler )
Example #11
Source File: post.py From denspi with Apache License 2.0 | 5 votes |
def convert_question_features_to_dataloader(query_eval_features, fp16, local_rank, predict_batch_size):
    """Wrap question features in a DataLoader.

    Args:
        query_eval_features: Sequence of objects exposing ``input_ids`` and
            ``input_mask``.
        fp16: When truthy, the id and mask tensors are cast with ``.half()``.
        local_rank: ``-1`` selects a ``SequentialSampler``; any other value a
            ``DistributedSampler``.
        predict_batch_size: Batch size of the returned loader.

    Returns:
        A ``DataLoader`` yielding ``(input_ids, input_mask, example_index)``.
    """
    def long_column(attr):
        return torch.tensor([getattr(feat, attr) for feat in query_eval_features], dtype=torch.long)

    ids = long_column("input_ids")
    mask = long_column("input_mask")
    # Row number of each feature (stays int64 even when fp16 is set).
    rows = torch.arange(ids.size(0), dtype=torch.long)

    if fp16:
        # NOTE(review): float16 cannot represent every integer above 2048
        # exactly — confirm that casting token ids to half is intended.
        ids = ids.half()
        mask = mask.half()

    question_data = TensorDataset(ids, mask, rows)
    sampler_cls = SequentialSampler if local_rank == -1 else DistributedSampler
    return DataLoader(
        question_data,
        sampler=sampler_cls(question_data),
        batch_size=predict_batch_size,
    )
Example #12
Source File: induce_dep_trees.py From interpret_bert with GNU General Public License v3.0 | 5 votes |
def save(args, model, tokenizer, device):
    """Print the tree heads induced from one attention head for ``args.sentence_text``.

    The attention matrix of layer ``args.layer_id`` / head ``args.head_id``
    (both 1-based) is cropped to the sentence, adjusted so that
    ``args.sentence_root`` attaches to the root column, and handed to ``mst``.
    Only the first batch of the loader is processed.

    Args:
        args: Namespace providing ``sentence_text``, ``batch_size``,
            ``layer_id``, ``head_id`` and ``sentence_root``.
        model: Callable returning a 3-tuple whose last element holds the raw
            attention layers.
        tokenizer: Forwarded to the feature conversion helpers.
        device: Device the input tensors are moved to.
    """
    # Convert data to ids.
    examples = [args.sentence_text]
    features = convert_examples_to_features(
        examples=examples,
        seq_length=2+get_max_seq_length(examples, tokenizer),
        tokenizer=tokenizer)

    ids = torch.tensor([feat.input_ids for feat in features], dtype=torch.long)
    mask = torch.tensor([feat.input_mask for feat in features], dtype=torch.long)
    rows = torch.arange(ids.size(0), dtype=torch.long)
    dataset = TensorDataset(ids, mask, rows)
    loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=args.batch_size)

    # Extract and print the dependency parse.
    for input_ids, input_mask, example_indices in loader:
        _, _, raw_attn_layers = model(
            input_ids.to(device), token_type_ids=None, attention_mask=input_mask.to(device)
        )

        # Drop the first and last token of the feature
        # (presumably the special boundary tokens — confirm).
        sentence_tokens = features[example_indices[0]].tokens[1:-1]
        span = len(sentence_tokens)+1
        head = raw_attn_layers[args.layer_id-1].squeeze()[args.head_id-1]
        attn = head[0:span, 0:span].detach().cpu().numpy()

        # Column 0 is blocked for everything except the designated root word,
        # and no token may attach to itself.
        attn[:, 0] = -1.
        attn[args.sentence_root, 0] = 1.0
        np.fill_diagonal(attn, -1.)
        heads = mst(attn)

        tokens = ['<root>'] + sentence_tokens
        print('tokens ==>')
        print(tokens)
        print('heads ==>')
        print([tokens[head_id] for head_id in heads])
        break
Example #13
Source File: dataloader.py From nonechucks with MIT License | 5 votes |
def _restore_default_samplers(cls): data.dataloader.SequentialSampler = cls.sequential data.dataloader.RandomSampler = cls.random
Example #14
Source File: dcca.py From mvlearn with Apache License 2.0 | 5 votes |
def _get_outputs(self, x1, x2): """ Private function to get the transformed data and the corresponding loss for the given inputs. Parameters ---------- x1 : torch.tensor Input view 1 data. x2 : torch.tensor Input view 2 data. Returns ------- losses : list List of losses for each batch taken from the input data. outputs : list of tensors outputs[i] is the output of the deep models for view i. """ with torch.no_grad(): self.model_.eval() data_size = x1.size(0) batch_idxs = list(BatchSampler(SequentialSampler(range(data_size)), batch_size=self.batch_size_, drop_last=False)) losses = [] outputs1 = [] outputs2 = [] for batch_idx in batch_idxs: batch_x1 = x1[batch_idx, :] batch_x2 = x2[batch_idx, :] o1, o2 = self.model_(batch_x1, batch_x2) outputs1.append(o1) outputs2.append(o2) loss = self.loss_(o1, o2) losses.append(loss.item()) outputs = [torch.cat(outputs1, dim=0).cpu().numpy(), torch.cat(outputs2, dim=0).cpu().numpy()] return losses, outputs
Example #15
Source File: sequence_classification.py From nlp-architect with Apache License 2.0 | 5 votes |
def inference(
    self,
    examples: List[SequenceClsInputExample],
    max_seq_length: int,
    batch_size: int = 64,
    evaluate=False,
):
    """
    Run inference on given examples

    Args:
        examples (List[SequenceClsInputExample]): examples
        max_seq_length (int): forwarded to ``convert_to_tensors``
        batch_size (int, optional): batch size. Defaults to 64.
        evaluate (bool, optional): when true, labels are included in the
            tensors, ``_evaluate`` is expected to return ``(logits, label_ids)``
            and ``evaluate_predictions`` is called on them. Defaults to False.

    Returns:
        the result of ``_postprocess_logits`` applied to the logits
    """
    data_set = self.convert_to_tensors(
        examples, max_seq_length=max_seq_length, include_labels=evaluate
    )
    loader = DataLoader(data_set, sampler=SequentialSampler(data_set), batch_size=batch_size)
    result = self._evaluate(loader)

    if evaluate:
        logits, label_ids = result
        preds = self._postprocess_logits(logits)
        self.evaluate_predictions(logits, label_ids)
        return preds
    return self._postprocess_logits(result)
Example #16
Source File: token_classification.py From nlp-architect with Apache License 2.0 | 5 votes |
def inference(
    self, examples: List[TokenClsInputExample], max_seq_length: int, batch_size: int = 64
):
    """
    Run inference on given examples

    Args:
        examples (List[TokenClsInputExample]): examples
        max_seq_length (int): forwarded to ``convert_to_tensors``
        batch_size (int, optional): batch size. Defaults to 64.

    Returns:
        List(tuple): one ``(tokens, tags)`` pair per example; ids missing from
        ``self.labels_id_map`` are tagged ``"O"``
    """
    data_set = self.convert_to_tensors(
        examples, max_seq_length=max_seq_length, include_labels=False
    )
    loader = DataLoader(data_set, sampler=SequentialSampler(data_set), batch_size=batch_size)
    logits = self._evaluate(loader)

    # Non-zero entries of the dataset's last tensor select the positions that
    # are kept for each example.
    active_positions = data_set.tensors[-1].view(len(data_set), -1) != 0.0
    predicted = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
    res_ids = [
        predicted[row][active_positions[row]].detach().cpu().numpy()
        for row in range(predicted.size()[0])
    ]

    return [
        (ex.tokens, [self.labels_id_map.get(t, "O") for t in tag_ids])
        for tag_ids, ex in zip(res_ids, examples)
    ]
Example #17
Source File: tagging.py From nlp-architect with Apache License 2.0 | 5 votes |
def inference(self, examples: List[TokenClsInputExample], batch_size: int = 64):
    """
    Do inference on given examples

    Args:
        examples (List[TokenClsInputExample]): examples
        batch_size (int, optional): batch size. Defaults to 64.

    Returns:
        List(tuple): a list of tuples of tokens, tags predicted by model;
        ids missing from ``self.label_id_str`` are tagged ``"O"``
    """
    data_set = self.convert_to_tensors(examples, include_labels=False)
    loader = DataLoader(data_set, sampler=SequentialSampler(data_set), batch_size=batch_size)
    eval_output = self.evaluate(loader)

    # Non-zero entries of the dataset's last tensor select the positions that
    # are kept for each example.
    active_positions = data_set.tensors[-1].view(len(data_set), -1) != 0.0
    # Only the first element of the evaluation output is used as logits.
    predicted = torch.argmax(F.log_softmax(eval_output[0], dim=2), dim=2)
    res_ids = [
        predicted[row][active_positions[row]].detach().cpu().numpy()
        for row in range(predicted.size()[0])
    ]

    return [
        (ex.tokens, [self.label_id_str.get(t, "O") for t in tag_ids])
        for tag_ids, ex in zip(res_ids, examples)
    ]
Example #18
Source File: pytorch_utils.py From nlp-recipes with MIT License | 5 votes |
def dataloader_from_dataset(
    ds, batch_size=32, num_gpus=None, shuffle=False, distributed=False
):
    """Creates a PyTorch DataLoader given a Dataset object.

    Args:
        ds (torch.utils.data.DataSet): A PyTorch dataset.
        batch_size (int, optional): Batch size.
            If more than 1 gpu is used, this would be the batch size per gpu.
            Defaults to 32.
        num_gpus (int, optional): The number of GPUs to be used.
            When None, ``torch.cuda.device_count()`` is used. Defaults to None.
        shuffle (bool, optional): If True, a RandomSampler is used, otherwise a
            SequentialSampler. Ignored when ``distributed`` is True.
            Defaults to False.
        distributed (bool, optional): If True, a DistributedSampler is used.
            Defaults to False.

    Returns:
        DataLoader: A PyTorch DataLoader over ``ds`` whose batch size is
        ``batch_size * max(1, num_gpus)``.
    """
    if num_gpus is None:
        num_gpus = torch.cuda.device_count()

    # The per-GPU batch size is scaled up; zero GPUs counts as one device.
    batch_size = batch_size * max(1, num_gpus)

    if distributed:
        sampler = DistributedSampler(ds)
    elif shuffle:
        sampler = RandomSampler(ds)
    else:
        sampler = SequentialSampler(ds)

    return DataLoader(ds, sampler=sampler, batch_size=batch_size)
Example #19
Source File: run_hnn.py From mt-dnn with MIT License | 5 votes |
def run_predict(args, model, device, test_data, prefix=None):
    """Run the model over every test item and dump each available prediction set.

    For each item the three logit streams returned by the model are collected
    (streams that come back as ``None`` are skipped), concatenated, written
    with ``np.savetxt`` into ``args.output_dir`` and, when the item has a
    truthy ``predict_fn``, handed to that callback as well.

    Args:
        args: Namespace providing ``predict_batch_size``, ``worker_num`` and
            ``output_dir``.
        model: Callable returning a 4-tuple whose first three entries are the
            SIM, LM and EN-Avg logits.
        device: Not read by this function.
        test_data: Iterable of items exposing ``name``, ``data`` and
            ``predict_fn``.
        prefix: Label used in the progress bar, log messages and file names.
    """
    # Run prediction for full data.
    eval_results = OrderedDict()  # not read anywhere in this function
    for test_item in test_data:
        torch.cuda.empty_cache()
        name = test_item.name
        loader = SequentialDataLoader(
            test_item.data,
            sampler=SequentialSampler(test_item.data),
            batch_size=args.predict_batch_size,
            num_workers = args.worker_num,
        )
        model.eval()

        collected = {'SIM-': [], 'LM-': [], 'EN-Avg-': []}
        for input_ids, tids, _ in tqdm(loader, ncols=80, desc='Predicting: {}'.format(prefix)):
            with torch.no_grad():
                sm_logits, lm_logits, en_logits, _ = model(input_ids, tids)
            for tag, logits in (('SIM-', sm_logits), ('LM-', lm_logits), ('EN-Avg-', en_logits)):
                if logits is not None:
                    collected[tag].append(logits.detach().cpu().numpy())

        def pred(predicts, tag):
            output_test_file = os.path.join(
                args.output_dir, "test_results_{}_{}_{}.txt".format(name, prefix, tag)
            )
            logger.info("***** Dump prediction results-{}-{}-{} *****".format(name, prefix, tag))
            logger.info("Location: {}".format(output_test_file))
            np.savetxt(output_test_file, predicts, delimiter='\t')
            predict_fn = test_item.predict_fn
            if predict_fn:
                predict_fn(predicts, args.output_dir, name, prefix, tag=tag)

        # Dump order is LM, then SIM, then EN-Avg.
        for tag in ('LM-', 'SIM-', 'EN-Avg-'):
            chunks = collected[tag]
            if len(chunks) > 0:
                pred(np.concatenate(chunks, axis=0), tag)
Example #20
Source File: run_sequence_level_classification.py From ZEN with Apache License 2.0 | 5 votes |
def evaluate(args, model, tokenizer, ngram_dict, processor, label_list):
    """Evaluate ``model`` on the test split and return its metrics.

    Args:
        args: Namespace providing ``local_rank``, ``eval_batch_size``,
            ``device`` and ``task_name``.
        model: Model called with ``input_ids``, ``input_ngram_ids`` and
            ``ngram_position_matrix``; must return logits.
        tokenizer, ngram_dict, processor, label_list: Forwarded to
            ``load_examples``.

    Returns:
        Whatever ``compute_metrics`` returns for the arg-max predictions and
        the gold label ids.

    Raises:
        ValueError: If the dataloader yields no batches (nothing to
            concatenate).
    """
    eval_dataset = load_examples(args, tokenizer, ngram_dict, processor, label_list, mode="test")

    # Run prediction for full data.
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_dataset)
    else:
        # Note that this sampler samples randomly.
        eval_sampler = DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation *****")
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)

    model.eval()
    # Per-batch arrays are collected and joined once at the end; appending to a
    # growing array inside the loop would copy everything on every batch.
    logit_chunks = []
    label_chunks = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids, input_ngram_ids, ngram_position_matrix, \
            ngram_lengths, ngram_seg_ids, ngram_masks = batch

        with torch.no_grad():
            logits = model(input_ids=input_ids,
                           input_ngram_ids=input_ngram_ids,
                           ngram_position_matrix=ngram_position_matrix,
                           labels=None, head_mask=None)

        logit_chunks.append(logits.detach().cpu().numpy())
        label_chunks.append(label_ids.detach().cpu().numpy())

    preds = np.argmax(np.concatenate(logit_chunks, axis=0), axis=1)
    out_label_ids = np.concatenate(label_chunks, axis=0)
    return compute_metrics(args.task_name, preds, out_label_ids)
Example #21
Source File: run_squad_document_full_e2e.py From RE3QA with Apache License 2.0 | 5 votes |
def _features_to_eval_dataloader(features, batch_size, local_rank):
    """Tensorize ``features`` and wrap them in an evaluation DataLoader.

    Each batch is ``(input_ids, input_mask, segment_ids, example_index)`` where
    ``example_index`` is the row number of the feature. ``local_rank == -1``
    selects a ``SequentialSampler``, anything else a ``DistributedSampler``.
    """
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
    if local_rank == -1:
        sampler = SequentialSampler(dataset)
    else:
        sampler = DistributedSampler(dataset)
    return DataLoader(dataset, sampler=sampler, batch_size=batch_size)


def build_eval_data(args, eval_examples, eval_features, filtered_eval_features, filtered_rank_logits, logger):
    """Build the ranker and reader evaluation dataloaders.

    The ranker loader covers ``eval_features`` with twice
    ``args.predict_batch_size``; the reader loader covers
    ``filtered_eval_features`` with ``args.predict_batch_size``.

    Args:
        args: Namespace providing ``predict_batch_size`` and ``local_rank``.
        eval_examples: Original examples (only counted and passed through).
        eval_features: Features fed to the ranker.
        filtered_eval_features: Features fed to the reader.
        filtered_rank_logits: Passed through unchanged.
        logger: Receives the size summary.

    Returns:
        tuple: ``(eval_examples, eval_features, filtered_eval_features,
        filtered_rank_logits, eval_rank_dataloader, eval_read_dataloader)``.
    """
    predict_batch_size_for_rank = 2 * args.predict_batch_size

    logger.info("Num orig examples = %d", len(eval_examples))
    logger.info("Num split features = %d", len(eval_features))
    logger.info("Num split filtered features = %d", len(filtered_eval_features))
    logger.info("Batch size for ranker = %d", predict_batch_size_for_rank)
    logger.info("Batch size for reader = %d", args.predict_batch_size)

    eval_rank_dataloader = _features_to_eval_dataloader(
        eval_features, predict_batch_size_for_rank, args.local_rank
    )
    eval_read_dataloader = _features_to_eval_dataloader(
        filtered_eval_features, args.predict_batch_size, args.local_rank
    )

    return eval_examples, eval_features, filtered_eval_features, filtered_rank_logits, \
        eval_rank_dataloader, eval_read_dataloader
Example #22
Source File: extract_features.py From interpret_bert with GNU General Public License v3.0 | 5 votes |
def save(args, model, tokenizer, device):
    """Write per-layer verb representations to one JSON-lines file per example group.

    For every group returned by ``read_examples`` a file
    ``args.output_folder + <group name> + ".json"`` is written; each line holds
    the example index, the rounded hidden vector of every encoder layer at the
    verb position, and the original instance record.

    Args:
        args: Namespace providing ``data_file``, ``output_folder`` and
            ``batch_size``.
        model: Callable returning ``(all_encoder_layers, _)``.
        tokenizer: Forwarded to the feature conversion helpers.
        device: Device the input tensors are moved to.
    """
    # Convert data to ids (default numbers obtained from Linzen et al.).
    examples = read_examples(args.data_file, 0.09, 0.01, 50)

    # Extract and write features.
    for s_name in examples:
        s_instances = examples[s_name]
        output_file = args.output_folder + s_name + ".json"
        features = convert_examples_to_features(
            s_instances,
            seqLen=2+get_max_seq_length(s_instances, tokenizer),
            tokenizer=tokenizer)

        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

        pbar = tqdm(total=len(s_instances)//args.batch_size)
        with open(output_file, "w", encoding='utf-8') as writer:
            for input_ids, input_mask, example_indices in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                # Feature extraction only: no autograd graph is needed.
                with torch.no_grad():
                    all_encoder_layers, _ = model(
                        input_ids, token_type_ids=None, attention_mask=input_mask
                    )
                # Convert each layer to numpy once per batch instead of once
                # per example in the batch.
                layer_arrays = [layer.detach().cpu().numpy() for layer in all_encoder_layers]

                for b, example_index in enumerate(example_indices):
                    unique_id = example_index.item()
                    # The stored verb index is shifted down by one to address
                    # the layer output.
                    verb_index = s_instances[unique_id]['verb_index']-1
                    output_json = collections.OrderedDict()
                    output_json["linex_index"] = unique_id
                    output_json["verb_layers"] = [
                        [round(x.item(), 6) for x in layer_output[b][verb_index]]
                        for layer_output in layer_arrays
                    ]
                    output_json["linzen_info"] = s_instances[unique_id]
                    writer.write(json.dumps(output_json) + "\n")
                pbar.update(1)
        pbar.close()
        print('written features to %s'%output_file)
Example #23
Source File: extract_features.py From interpret_bert with GNU General Public License v3.0 | 5 votes |
def save(args, model, tokenizer, device):
    """Write per-layer chunk start/end representations to a JSON-lines file.

    Every line of ``args.output_file`` holds the example id, its label and
    tokens, the chunk span indices, and the rounded hidden vector of every
    encoder layer at the span start and span end.

    Args:
        args: Namespace providing ``train_file``, ``output_file`` and
            ``batch_size``.
        model: Callable returning ``(all_encoder_layers, _)``.
        tokenizer: Forwarded to the feature conversion helpers.
        device: Device the input tensors are moved to.
    """
    # Convert data to ids. The default numbers of labeled and unlabeled chunks
    # to consider are obtained from https://aclweb.org/anthology/D18-1179
    examples = read_examples(args.train_file, 3000, 500)
    features = convert_examples_to_features(
        examples=examples,
        seq_length=2+get_max_seq_length(examples, tokenizer),
        tokenizer=tokenizer)
    chunk_spans = get_chunk_spans(examples, features)

    # Extract and write features.
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

    pbar = tqdm(total=len(examples)//args.batch_size)
    with open(args.output_file, "w", encoding='utf-8') as writer:
        for input_ids, input_mask, example_indices in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            # Feature extraction only: no autograd graph is needed.
            with torch.no_grad():
                all_encoder_layers, _ = model(
                    input_ids, token_type_ids=None, attention_mask=input_mask
                )
            # Convert each layer to numpy once per batch instead of once per
            # example in the batch.
            layer_arrays = [layer.detach().cpu().numpy() for layer in all_encoder_layers]

            for b, example_index in enumerate(example_indices):
                feature_info = features[example_index.item()]
                unique_id = int(feature_info.unique_id)
                example_info, chunk_info = examples[unique_id], chunk_spans[unique_id]

                output_json = collections.OrderedDict()
                output_json["linex_index"] = unique_id
                output_json["label"] = example_info.label
                output_json["tokens"] = feature_info.tokens
                output_json["chunk_start_idx"] = chunk_info[0]
                output_json["chunk_end_idx"] = chunk_info[1]
                output_json["start_layer"] = [
                    [round(x.item(), 6) for x in layer_output[b][chunk_info[0]]]
                    for layer_output in layer_arrays
                ]
                output_json["end_layer"] = [
                    [round(x.item(), 6) for x in layer_output[b][chunk_info[1]]]
                    for layer_output in layer_arrays
                ]
                writer.write(json.dumps(output_json) + "\n")
            pbar.update(1)
    pbar.close()
    print('written features to %s'%(args.output_file))
Example #24
Source File: data.py From fast-bert with Apache License 2.0 | 5 votes |
def get_dl_from_texts(self, texts):
    """Build a sequentially sampled DataLoader over raw texts.

    Args:
        texts: Iterable of texts; each is wrapped in an ``InputExample`` whose
            first argument is its position and whose ``label`` is ``None``.

    Returns:
        A ``DataLoader`` yielding ``(input_ids, input_mask, segment_ids)``
        batches of size ``self.bs``.
    """
    test_examples = [
        InputExample(index, text, label=None) for index, text in enumerate(texts)
    ]
    test_features = convert_examples_to_features(
        test_examples,
        label_list=self.labels,
        tokenizer=self.tokenizer,
        max_seq_length=self.maxlen)

    all_input_ids = torch.tensor(
        [f.input_ids for f in test_features], dtype=torch.long)
    all_input_mask = torch.tensor(
        [f.input_mask for f in test_features], dtype=torch.long)
    all_segment_ids = torch.tensor(
        [f.segment_ids for f in test_features], dtype=torch.long)

    test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
    test_sampler = SequentialSampler(test_data)
    return DataLoader(test_data, sampler=test_sampler, batch_size=self.bs)
Example #25
Source File: data_abs.py From fast-bert with Apache License 2.0 | 5 votes |
def get_dl_from_texts(self, texts):
    """Build a sequentially sampled summarization DataLoader over raw texts.

    Args:
        texts: Texts handed to ``SummarizationInMemoryDataset``.

    Returns:
        A ``DataLoader`` with batch size ``self.batch_size_per_gpu`` whose
        batches are assembled by ``collate`` using this object's tokenizer,
        ``max_seq_length`` (as ``block_size``) and device.
    """
    dataset = SummarizationInMemoryDataset(texts)
    sampler = SequentialSampler(dataset)

    def collate_batch(data):
        return collate(
            data, self.tokenizer, block_size=self.max_seq_length, device=self.device
        )

    return DataLoader(
        dataset,
        sampler=sampler,
        batch_size=self.batch_size_per_gpu,
        collate_fn=collate_batch,
    )
Example #26
Source File: run_cls_span.py From SpanABSA with Apache License 2.0 | 5 votes |
def pipeline_eval_data(args, tokenizer, logger):
    """Build the evaluation DataLoader for the pipeline (extract-then-classify) setting.

    Span starts, span ends and span masks come from previously pickled
    extraction predictions (``args.extraction_file``), matched to the
    evaluation features by ``unique_id``.

    Args:
        args: Namespace providing ``debug``, ``data_dir``, ``predict_file``,
            ``verbose_logging``, ``max_seq_length``, ``extraction_file``,
            ``predict_batch_size`` and ``local_rank``. When ``args.debug`` is
            truthy, ``args.predict_batch_size`` is overwritten with 8.
        tokenizer: Forwarded to ``convert_examples_to_features``.
        logger: Receives the size summary.

    Returns:
        tuple: ``(eval_examples, eval_features, eval_dataloader)``.

    Raises:
        KeyError: If a feature's ``unique_id`` has no extraction prediction.
    """
    if args.debug:
        args.predict_batch_size = 8

    eval_path = os.path.join(args.data_dir, args.predict_file)
    eval_set = read_absa_data(eval_path)
    eval_examples = convert_absa_data(dataset=eval_set, verbose_logging=args.verbose_logging)
    eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length,
                                                 args.verbose_logging, logger)

    assert args.extraction_file is not None
    # NOTE(security): unpickling can execute arbitrary code — only load
    # extraction files from a trusted source.
    with open(args.extraction_file, 'rb') as handle:
        extract_predictions = pickle.load(handle)
    extract_dict = {pred.unique_id: pred for pred in extract_predictions}
    # One extraction prediction per feature, in feature order.
    eval_extract_preds = [extract_dict[feature.unique_id] for feature in eval_features]
    assert len(eval_extract_preds) == len(eval_features)

    logger.info("Num orig examples = %d", len(eval_examples))
    logger.info("Num split features = %d", len(eval_features))
    logger.info("Batch size = %d", args.predict_batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_span_starts = torch.tensor([f.start_indexes for f in eval_extract_preds], dtype=torch.long)
    all_span_ends = torch.tensor([f.end_indexes for f in eval_extract_preds], dtype=torch.long)
    all_label_masks = torch.tensor([f.span_masks for f in eval_extract_preds], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_span_starts,
                              all_span_ends, all_label_masks, all_example_index)

    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
    return eval_examples, eval_features, eval_dataloader
Example #27
Source File: run_cls_span.py From SpanABSA with Apache License 2.0 | 5 votes |
def read_eval_data(args, tokenizer, logger):
    """Load the evaluation split and wrap it in a DataLoader.

    Args:
        args: Run configuration. This function reads ``debug``, ``data_dir``,
            ``predict_file``, ``verbose_logging``, ``max_seq_length``,
            ``predict_batch_size`` and ``local_rank``. When ``debug`` is
            truthy, ``predict_batch_size`` is overwritten with 8.
        tokenizer: Forwarded to ``convert_examples_to_features``.
        logger: Logger used for the summary lines and forwarded to
            ``convert_examples_to_features``.

    Returns:
        A tuple ``(eval_examples, eval_features, eval_dataloader)``.
    """
    if args.debug:
        args.predict_batch_size = 8

    eval_examples = convert_absa_data(
        dataset=read_absa_data(os.path.join(args.data_dir, args.predict_file)),
        verbose_logging=args.verbose_logging,
    )
    eval_features = convert_examples_to_features(
        eval_examples, tokenizer, args.max_seq_length, args.verbose_logging, logger
    )

    logger.info("Num orig examples = %d", len(eval_examples))
    logger.info("Num split features = %d", len(eval_features))
    logger.info("Batch size = %d", args.predict_batch_size)

    def _long_column(attr_name):
        # Gather one attribute from every feature into a single int64 tensor.
        rows = [getattr(feature, attr_name) for feature in eval_features]
        return torch.tensor(rows, dtype=torch.long)

    input_ids = _long_column("input_ids")
    input_mask = _long_column("input_mask")
    segment_ids = _long_column("segment_ids")
    span_starts = _long_column("start_indexes")
    span_ends = _long_column("end_indexes")
    label_masks = _long_column("label_masks")
    # Row index of each feature, so batch outputs can be traced back.
    example_index = torch.arange(input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(
        input_ids, input_mask, segment_ids, span_starts, span_ends, label_masks, example_index
    )
    # local_rank == -1 selects the plain sequential sampler; any other value
    # selects the distributed one.
    eval_sampler = (
        SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
    )
    eval_dataloader = DataLoader(
        eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size
    )
    return eval_examples, eval_features, eval_dataloader
Example #28
Source File: run_bert.py From Bert-Multi-Label-Text-Classification with MIT License | 5 votes |
def run_test(args):
    """Run the BERT multi-label model over the test set and print the result.

    Reads raw test data from ``config['test_path']``, turns it into examples,
    features and a dataset via ``BertProcessor``, loads the model from
    ``config['checkpoint_dir']`` and prints whatever ``Predictor.predict``
    returns.

    Args:
        args: Run configuration. This function reads ``do_lower_case``,
            ``arch``, ``eval_max_seq_len``, ``train_batch_size`` and
            ``n_gpu``.
    """
    from pybert.io.task_data import TaskData
    from pybert.test.predictor import Predictor

    data = TaskData()
    targets, sentences = data.read_data(raw_data_path=config['test_path'],
                                        preprocessor=EnglishPreProcessor(),
                                        is_train=False)
    lines = list(zip(sentences, targets))

    processor = BertProcessor(vocab_path=config['bert_vocab_path'],
                              do_lower_case=args.do_lower_case)
    label_list = processor.get_labels()

    test_data = processor.get_test(lines=lines)
    # config['data_dir'] must support the `/` operator (presumably a
    # pathlib.Path -- confirm against the config module).
    test_examples = processor.create_examples(
        lines=test_data,
        example_type='test',
        cached_examples_file=config['data_dir'] / f"cached_test_examples_{args.arch}",
    )
    test_features = processor.create_features(
        examples=test_examples,
        max_seq_len=args.eval_max_seq_len,
        cached_features_file=(
            config['data_dir'] / f"cached_test_features_{args.eval_max_seq_len}_{args.arch}"
        ),
    )
    test_dataset = processor.create_dataset(test_features)
    test_sampler = SequentialSampler(test_dataset)
    # NOTE(review): the test loader reuses args.train_batch_size; kept as in
    # the original since no dedicated test batch size is visible here.
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler,
                                 batch_size=args.train_batch_size,
                                 collate_fn=collate_fn)
    model = BertForMultiLable.from_pretrained(config['checkpoint_dir'],
                                              num_labels=len(label_list))

    # ----------- predicting
    logger.info('model predicting....')
    predictor = Predictor(model=model, logger=logger, n_gpu=args.n_gpu)
    result = predictor.predict(data=test_dataloader)
    print(result)
Example #29
Source File: train.py From subword-qac with MIT License | 5 votes |
def test(model, tokenizer, test_data, args):
    """Evaluate a saved model on the test data and log the summary string.

    Args:
        model: Model object; weights are loaded into it via ``model_load``
            and it is then moved to the module-level ``device``.
        tokenizer: Forwarded to ``collate_fn`` for every batch.
        test_data: Raw test data wrapped in a ``QueryDataset``.
        args: Run configuration. This function reads ``model_dir``, ``bsz``,
            ``num_workers``, ``sample`` and ``max_seq_len``.
    """
    logger.info("Test starts!")
    model_load(args.model_dir, model)
    model = model.to(device)

    dataset = QueryDataset(test_data)
    ordered_sampler = SequentialSampler(dataset)
    loader = DataLoader(
        dataset,
        sampler=ordered_sampler,
        batch_size=args.bsz,
        num_workers=args.num_workers,
        # Bind tokenizer and sampling options so the loader only has to pass
        # the raw batch.
        collate_fn=lambda batch: collate_fn(batch, tokenizer, args.sample, args.max_seq_len),
    )

    # evaluate returns a pair; only the formatted summary is reported.
    _loss, summary = evaluate(model, loader)
    logger.info(f"| test | {summary}")
Example #30
Source File: run_xlnet.py From Bert-Multi-Label-Text-Classification with MIT License | 5 votes |
def run_test(args):
    """Run the XLNet multi-label model over the test set and print the result.

    Reads raw test data from ``config['test_path']``, turns it into examples,
    features and a dataset via ``XlnetProcessor``, loads the model from
    ``config['checkpoint_dir']`` and prints whatever ``Predictor.predict``
    returns.

    Args:
        args: Run configuration. This function reads ``do_lower_case``,
            ``arch``, ``eval_max_seq_len``, ``train_batch_size`` and
            ``n_gpu``.
    """
    from pybert.io.task_data import TaskData
    from pybert.test.predictor import Predictor

    data = TaskData()
    # NOTE(review): this previously passed is_train=True while reading the
    # test file. Changed to False for a test-time read, matching the BERT
    # variant of run_test -- confirm against TaskData.read_data.
    targets, sentences = data.read_data(raw_data_path=config['test_path'],
                                        preprocessor=EnglishPreProcessor(),
                                        is_train=False)
    # Materialise the pairs: a bare zip object is single-pass and has no
    # len(), so it cannot be sized or iterated more than once downstream.
    lines = list(zip(sentences, targets))

    processor = XlnetProcessor(vocab_path=config['xlnet_vocab_path'],
                               do_lower_case=args.do_lower_case)
    label_list = processor.get_labels()

    test_data = processor.get_test(lines=lines)
    # config['data_dir'] must support the `/` operator (presumably a
    # pathlib.Path -- confirm against the config module).
    test_examples = processor.create_examples(
        lines=test_data,
        example_type='test',
        cached_examples_file=config['data_dir'] / f"cached_test_examples_{args.arch}",
    )
    test_features = processor.create_features(
        examples=test_examples,
        max_seq_len=args.eval_max_seq_len,
        cached_features_file=(
            config['data_dir'] / f"cached_test_features_{args.eval_max_seq_len}_{args.arch}"
        ),
    )
    test_dataset = processor.create_dataset(test_features)
    test_sampler = SequentialSampler(test_dataset)
    # NOTE(review): the test loader reuses args.train_batch_size; kept as in
    # the original since no dedicated test batch size is visible here.
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler,
                                 batch_size=args.train_batch_size,
                                 collate_fn=collate_fn)
    model = XlnetForMultiLable.from_pretrained(config['checkpoint_dir'],
                                               num_labels=len(label_list))

    # ----------- predicting
    logger.info('model predicting....')
    predictor = Predictor(model=model, logger=logger, n_gpu=args.n_gpu)
    result = predictor.predict(data=test_dataloader)
    print(result)