Python torch.utils.data.SequentialSampler() Examples

The following are 30 code examples of torch.utils.data.SequentialSampler(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module torch.utils.data, or try the search function.
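Before the project examples, here is a minimal, self-contained sketch (toy data, not taken from any project below) of the typical pattern: a SequentialSampler yields dataset indices 0 through len(dataset)-1 in order, which makes it the usual choice for evaluation and prediction loaders.

import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

dataset = TensorDataset(torch.arange(5))   # five toy examples
sampler = SequentialSampler(dataset)       # yields indices 0, 1, 2, 3, 4 in order
loader = DataLoader(dataset, sampler=sampler, batch_size=2)

print(list(sampler))      # [0, 1, 2, 3, 4]
for (batch,) in loader:
    print(batch)          # tensor([0, 1]), then tensor([2, 3]), then tensor([4])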
Example #1
Source File: model.py    From MAX-Toxic-Comment-Classifier with Apache License 2.0 (6 votes)
def _pre_process(self, input):
        # Record the time spent in the prediction functions
        self.start_time = time.time()

        # Converting the input to features
        test_examples = [InputExample(guid=i, text_a=x, labels=[]) for i, x in enumerate(input)]
        test_features = convert_examples_to_features(test_examples, self.max_seq_length, self.tokenizer)

        all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)

        # Turn input examples into batches
        test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
        test_sampler = SequentialSampler(test_data)
        self.test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=self.eval_batch_size)

        return test_examples 
Example #2
Source File: njuner.py    From bert-ner with MIT License (6 votes)
def _predict_features(self, features, tokens):
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        predict_data = TensorDataset(all_input_ids, all_input_mask)
        predict_sampler = SequentialSampler(predict_data)
        predict_dataloader = DataLoader(predict_data, sampler=predict_sampler, batch_size=self._batch_size)
        self._model.eval()
        predict_ids = []
        for batch in predict_dataloader:
            batch = tuple(t.to(self._device) for t in batch)
            input_ids, input_mask = batch
            logits = self._model(input_ids, input_mask)
            logits = logits.detach().cpu().numpy()
            predict_ids.extend(np.argmax(logits, -1).tolist())
        predictions = []
        for token_line, predict_line in zip(tokens, predict_ids):
            predictions.append([self._label_list[label_id] for label_id in predict_line[1: 1+len(token_line)]])
        return predictions 
Example #3
Source File: data_cls.py    From fast-bert with Apache License 2.0 (6 votes)
def get_dl_from_texts(self, texts):

        test_examples = []
        input_data = []

        for index, text in enumerate(texts):
            test_examples.append(InputExample(index, text, label=None))
            input_data.append({"id": index, "text": text})

        test_dataset = self.get_dataset_from_examples(
            test_examples, "test", is_test=True, no_cache=True
        )

        test_sampler = SequentialSampler(test_dataset)
        return DataLoader(
            test_dataset, sampler=test_sampler, batch_size=self.batch_size_per_gpu
        ) 
Example #4
Source File: run_extract_span.py    From SpanABSA with Apache License 2.0 (6 votes)
def read_eval_data(args, tokenizer, logger):
    eval_path = os.path.join(args.data_dir, args.predict_file)
    eval_set = read_absa_data(eval_path)
    eval_examples = convert_absa_data(dataset=eval_set, verbose_logging=args.verbose_logging)

    eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length,
                                                 args.verbose_logging, logger)

    logger.info("Num orig examples = %d", len(eval_examples))
    logger.info("Num split features = %d", len(eval_features))
    logger.info("Batch size = %d", args.predict_batch_size)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
    return eval_examples, eval_features, eval_dataloader 
Example #5
Source File: run_joint_span.py    From SpanABSA with Apache License 2.0 (6 votes)
def read_eval_data(args, tokenizer, logger):
    eval_path = os.path.join(args.data_dir, args.predict_file)
    eval_set = read_absa_data(eval_path)
    eval_examples = convert_absa_data(dataset=eval_set, verbose_logging=args.verbose_logging)

    eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length,
                                                 args.verbose_logging, logger)

    logger.info("Num orig examples = %d", len(eval_examples))
    logger.info("Num split features = %d", len(eval_features))
    logger.info("Batch size = %d", args.predict_batch_size)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
    return eval_examples, eval_features, eval_dataloader 
Example #6
Source File: data_loading.py    From pytorch-lightning with Apache License 2.0 (6 votes)
def auto_add_sampler(self, dataloader: DataLoader, train: bool) -> DataLoader:

        # don't do anything if it's not a dataloader
        is_dataloader = isinstance(dataloader, DataLoader)
        # don't manipulate iterable datasets
        is_iterable_ds = _has_iterable_dataset(dataloader)

        if not is_dataloader or is_iterable_ds:
            return dataloader
        need_dist_sampler = (self.use_ddp or self.use_ddp2 or self.use_horovod or self.use_tpu)

        if self.replace_sampler_ddp and need_dist_sampler:
            if not isinstance(dataloader.sampler, (SequentialSampler, RandomSampler)):
                raise MisconfigurationException(
                    'You seem to have configured a sampler in your DataLoader. This will be replaced'
                    ' by `DistributedSampler` since `replace_sampler_ddp` is True and you are using'
                    ' distributed training. Either remove the sampler from your DataLoader or set'
                    ' `replace_sampler_ddp`=False if you want to use your custom sampler.')

            # replace with distributed sampler
            sampler = self._get_distributed_sampler(dataloader)
            dataloader = self.replace_sampler(dataloader, sampler)

        return dataloader 
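The isinstance check above leans on a DataLoader default: when no sampler is passed, PyTorch attaches a SequentialSampler (or a RandomSampler when shuffle=True). A minimal sketch of that behavior in plain PyTorch, independent of Lightning:

import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset

ds = TensorDataset(torch.arange(10))
assert isinstance(DataLoader(ds).sampler, SequentialSampler)            # default, shuffle=False
assert isinstance(DataLoader(ds, shuffle=True).sampler, RandomSampler)  # shuffle=True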
Example #7
Source File: base_task.py    From Doc2EDAG with MIT License (6 votes)
def prepare_data_loader(self, dataset, batch_size, rand_flag=True):
        # prepare data loader
        if rand_flag:
            data_sampler = RandomSampler(dataset)
        else:
            data_sampler = SequentialSampler(dataset)

        if self.custom_collate_fn is None:
            dataloader = DataLoader(dataset,
                                    batch_size=batch_size,
                                    sampler=data_sampler)
        else:
            dataloader = DataLoader(dataset,
                                    batch_size=batch_size,
                                    sampler=data_sampler,
                                    collate_fn=self.custom_collate_fn)

        return dataloader 
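A hypothetical call site for the helper above (task and the dataset names are made-up placeholders, not Doc2EDAG code): rand_flag=True gives a shuffled training loader, while rand_flag=False falls back to a deterministic SequentialSampler for evaluation.

# Hypothetical usage; `task`, `train_dataset`, and `eval_dataset` are placeholders.
train_dataloader = task.prepare_data_loader(train_dataset, batch_size=32, rand_flag=True)
eval_dataloader = task.prepare_data_loader(eval_dataset, batch_size=32, rand_flag=False)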
Example #8
Source File: bert.py    From BERT-SQuAD with GNU Affero General Public License v3.0 (5 votes)
def predict(self, passage: str, question: str):
        example = input_to_squad_example(passage,question)
        features = squad_examples_to_features(example,self.tokenizer,self.max_seq_length,self.doc_stride,self.max_query_length)
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_example_index)
        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=1)
        all_results = []
        for batch in eval_dataloader:
            batch = tuple(t.to(self.device) for t in batch)
            with torch.no_grad():
                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'token_type_ids': batch[2]}
                example_indices = batch[3]
                outputs = self.model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                result = RawResult(unique_id    = unique_id,
                                    start_logits = to_list(outputs[0][i]),
                                    end_logits   = to_list(outputs[1][i]))
                all_results.append(result)
        answer = get_answer(example,features,all_results,self.n_best_size,self.max_answer_length,self.do_lower_case)
        return answer 
Example #9
Source File: run_albert.py    From Bert-Multi-Label-Text-Classification with MIT License (5 votes)
def run_test(args):
    from pybert.io.task_data import TaskData
    from pybert.test.predictor import Predictor
    data = TaskData()
    targets, sentences = data.read_data(raw_data_path=config['test_path'],
                                        preprocessor=EnglishPreProcessor(),
                                        is_train=False)
    lines = list(zip(sentences, targets))
    processor = AlbertProcessor(spm_model_file=config['albert_vocab_path'], do_lower_case=args.do_lower_case,
                                vocab_file=None)
    label_list = processor.get_labels()
    id2label = {i: label for i, label in enumerate(label_list)}

    test_data = processor.get_test(lines=lines)
    test_examples = processor.create_examples(lines=test_data,
                                              example_type='test',
                                              cached_examples_file=config[
                                             'data_dir'] / f"cached_test_examples_{args.arch}")
    test_features = processor.create_features(examples=test_examples,
                                              max_seq_len=args.eval_max_seq_len,
                                              cached_features_file=config[
                                              'data_dir'] / "cached_test_features_{}_{}".format(
                                                  args.eval_max_seq_len, args.arch
                                              ))
    test_dataset = processor.create_dataset(test_features)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.train_batch_size,
                                 collate_fn=collate_fn)
    model = AlbertForMultiLable.from_pretrained(config['checkpoint_dir'], num_labels=len(label_list))

    # ----------- predicting
    logger.info('model predicting....')
    predictor = Predictor(model=model,logger=logger,n_gpu=args.n_gpu)
    result = predictor.predict(data=test_dataloader)
    print(result) 
Example #10
Source File: dataloader.py    From nonechucks with MIT License (5 votes)
def _replace_default_samplers(cls):
        cls.sequential = data.dataloader.SequentialSampler
        cls.random = data.dataloader.RandomSampler

        def safe_sampler_callable(sampler_cls, dataset):
            return SafeSampler(dataset, sampler_cls(dataset))

        data.dataloader.SequentialSampler = partial(
            safe_sampler_callable, data.SequentialSampler
        )
        data.dataloader.RandomSampler = partial(
            safe_sampler_callable, data.RandomSampler
        ) 
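The partial(...) indirection above matters because DataLoader constructs its default samplers with a single dataset argument; the patched callables keep that one-argument signature while wrapping the real sampler. A generic sketch of the same binding trick (toy wrapper, not the nonechucks SafeSampler API):

from functools import partial
from torch.utils.data import SequentialSampler

def wrapping_callable(sampler_cls, dataset):
    # Stand-in for safe_sampler_callable: build the real sampler, then wrap it
    # (here the "wrapper" just materializes the indices).
    return list(sampler_cls(dataset))

PatchedSequentialSampler = partial(wrapping_callable, SequentialSampler)
print(PatchedSequentialSampler(range(4)))  # [0, 1, 2, 3] -- called with only (dataset)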
Example #11
Source File: post.py    From denspi with Apache License 2.0 (5 votes)
def convert_question_features_to_dataloader(query_eval_features, fp16, local_rank, predict_batch_size):
    all_input_ids_ = torch.tensor([f.input_ids for f in query_eval_features], dtype=torch.long)
    all_input_mask_ = torch.tensor([f.input_mask for f in query_eval_features], dtype=torch.long)
    all_example_index_ = torch.arange(all_input_ids_.size(0), dtype=torch.long)
    if fp16:
        all_input_ids_, all_input_mask_ = tuple(t.half() for t in (all_input_ids_, all_input_mask_))

    question_data = TensorDataset(all_input_ids_, all_input_mask_, all_example_index_)

    if local_rank == -1:
        question_sampler = SequentialSampler(question_data)
    else:
        question_sampler = DistributedSampler(question_data)
    question_dataloader = DataLoader(question_data, sampler=question_sampler, batch_size=predict_batch_size)
    return question_dataloader 
Example #12
Source File: induce_dep_trees.py    From interpret_bert with GNU General Public License v3.0 (5 votes)
def save(args, model, tokenizer, device):
  # convert data to ids
  examples = [args.sentence_text]
  features = convert_examples_to_features(
      examples=examples,
      seq_length=2 + get_max_seq_length(examples, tokenizer),
      tokenizer=tokenizer)

  # extract and write dependency parses
  all_input_ids = torch.tensor([f.input_ids for f in features],
                               dtype=torch.long)
  all_input_mask = torch.tensor([f.input_mask for f in features],
                                dtype=torch.long)
  all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
  eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
  eval_sampler = SequentialSampler(eval_data)
  eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                               batch_size=args.batch_size)
  for input_ids, input_mask, example_indices in eval_dataloader:
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    all_encoder_layers, pooled_layer, raw_attn_layers = model(
        input_ids, token_type_ids=None, attention_mask=input_mask)
    cur_tokens = features[example_indices[0]].tokens[1:-1]
    cur_layer = raw_attn_layers[args.layer_id-1].squeeze()
    cur_head = cur_layer[args.head_id-1]
    cur_attn_matrix = cur_head[0:len(cur_tokens)+1, 0:len(cur_tokens)+1].detach().cpu().numpy()
    cur_attn_matrix[:,0] = -1.
    cur_attn_matrix[args.sentence_root,0] = 1.0
    np.fill_diagonal(cur_attn_matrix, -1.)
    mst_out = mst(cur_attn_matrix)
    tokens = ['<root>'] + cur_tokens
    print('tokens ==>')
    print(tokens)
    print('heads ==>')
    print([tokens[head_id] for head_id in mst_out])
    break 
Example #13
Source File: dataloader.py    From nonechucks with MIT License (5 votes)
def _restore_default_samplers(cls):
        data.dataloader.SequentialSampler = cls.sequential
        data.dataloader.RandomSampler = cls.random 
Example #14
Source File: dcca.py    From mvlearn with Apache License 2.0 (5 votes)
def _get_outputs(self, x1, x2):
        """
        Private function to get the transformed data and the corresponding
        loss for the given inputs.

        Parameters
        ----------
        x1 : torch.tensor
            Input view 1 data.
        x2 : torch.tensor
            Input view 2 data.

        Returns
        -------
        losses : list
            List of losses for each batch taken from the input data.
        outputs : list of tensors
            outputs[i] is the output of the deep models for view i.
        """
        with torch.no_grad():
            self.model_.eval()
            data_size = x1.size(0)
            batch_idxs = list(BatchSampler(SequentialSampler(range(data_size)),
                              batch_size=self.batch_size_,
                              drop_last=False))
            losses = []
            outputs1 = []
            outputs2 = []
            for batch_idx in batch_idxs:
                batch_x1 = x1[batch_idx, :]
                batch_x2 = x2[batch_idx, :]
                o1, o2 = self.model_(batch_x1, batch_x2)
                outputs1.append(o1)
                outputs2.append(o2)
                loss = self.loss_(o1, o2)
                losses.append(loss.item())
        outputs = [torch.cat(outputs1, dim=0).cpu().numpy(),
                   torch.cat(outputs2, dim=0).cpu().numpy()]

        return losses, outputs 
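The BatchSampler(SequentialSampler(range(n)), ...) idiom above is a lightweight way to get contiguous index lists for slicing tensors directly, without building a DataLoader. A minimal sketch of what it yields:

from torch.utils.data import BatchSampler, SequentialSampler

batch_idxs = list(BatchSampler(SequentialSampler(range(10)), batch_size=4, drop_last=False))
print(batch_idxs)  # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]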
Example #15
Source File: sequence_classification.py    From nlp-architect with Apache License 2.0 (5 votes)
def inference(
        self,
        examples: List[SequenceClsInputExample],
        max_seq_length: int,
        batch_size: int = 64,
        evaluate=False,
    ):
        """
        Run inference on given examples

        Args:
            examples (List[SequenceClsInputExample]): examples
            batch_size (int, optional): batch size. Defaults to 64.

        Returns:
            logits
        """
        data_set = self.convert_to_tensors(
            examples, max_seq_length=max_seq_length, include_labels=evaluate
        )
        inf_sampler = SequentialSampler(data_set)
        inf_dataloader = DataLoader(data_set, sampler=inf_sampler, batch_size=batch_size)
        logits = self._evaluate(inf_dataloader)
        if not evaluate:
            preds = self._postprocess_logits(logits)
        else:
            logits, label_ids = logits
            preds = self._postprocess_logits(logits)
            self.evaluate_predictions(logits, label_ids)
        return preds 
Example #16
Source File: token_classification.py    From nlp-architect with Apache License 2.0 (5 votes)
def inference(
        self, examples: List[TokenClsInputExample], max_seq_length: int, batch_size: int = 64
    ):
        """
        Run inference on given examples

        Args:
            examples (List[TokenClsInputExample]): examples
            batch_size (int, optional): batch size. Defaults to 64.

        Returns:
            logits
        """
        data_set = self.convert_to_tensors(
            examples, max_seq_length=max_seq_length, include_labels=False
        )
        inf_sampler = SequentialSampler(data_set)
        inf_dataloader = DataLoader(data_set, sampler=inf_sampler, batch_size=batch_size)
        logits = self._evaluate(inf_dataloader)
        active_positions = data_set.tensors[-1].view(len(data_set), -1) != 0.0
        logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
        res_ids = []
        for i in range(logits.size()[0]):
            res_ids.append(logits[i][active_positions[i]].detach().cpu().numpy())
        output = []
        for tag_ids, ex in zip(res_ids, examples):
            tokens = ex.tokens
            tags = [self.labels_id_map.get(t, "O") for t in tag_ids]
            output.append((tokens, tags))
        return output 
Example #17
Source File: tagging.py    From nlp-architect with Apache License 2.0 (5 votes)
def inference(self, examples: List[TokenClsInputExample], batch_size: int = 64):
        """
        Do inference on given examples

        Args:
            examples (List[TokenClsInputExample]): examples
            batch_size (int, optional): batch size. Defaults to 64.

        Returns:
            List(tuple): a list of tuples of tokens, tags predicted by model
        """
        data_set = self.convert_to_tensors(examples, include_labels=False)
        inf_sampler = SequentialSampler(data_set)
        inf_dataloader = DataLoader(data_set, sampler=inf_sampler, batch_size=batch_size)
        logits = self.evaluate(inf_dataloader)
        active_positions = data_set.tensors[-1].view(len(data_set), -1) != 0.0
        logits = torch.argmax(F.log_softmax(logits[0], dim=2), dim=2)
        res_ids = []
        for i in range(logits.size()[0]):
            res_ids.append(logits[i][active_positions[i]].detach().cpu().numpy())
        output = []
        for tag_ids, ex in zip(res_ids, examples):
            tokens = ex.tokens
            tags = [self.label_id_str.get(t, "O") for t in tag_ids]
            output.append((tokens, tags))
        return output 
Example #18
Source File: pytorch_utils.py    From nlp-recipes with MIT License (5 votes)
def dataloader_from_dataset(
    ds, batch_size=32, num_gpus=None, shuffle=False, distributed=False
):
    """Creates a PyTorch DataLoader given a Dataset object.

    Args:
        ds (torch.utils.data.Dataset): A PyTorch dataset.
        batch_size (int, optional): Batch size.
            If more than 1 gpu is used, this would be the batch size per gpu.
            Defaults to 32.
        num_gpus (int, optional): The number of GPUs to be used. Defaults to None,
            in which case all available GPUs are counted.
        shuffle (bool, optional): If True, a RandomSampler is used. Defaults to False.
        distributed (bool, optional): If True, a DistributedSampler is used.
            Defaults to False.

    Returns:
        DataLoader: A PyTorch DataLoader with the appropriate sampler.
    """
    if num_gpus is None:
        num_gpus = torch.cuda.device_count()

    batch_size = batch_size * max(1, num_gpus)

    if distributed:
        sampler = DistributedSampler(ds)
    else:
        sampler = RandomSampler(ds) if shuffle else SequentialSampler(ds)

    return DataLoader(ds, sampler=sampler, batch_size=batch_size) 
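A quick sketch exercising the SequentialSampler branch of the helper above (the dataset here is toy data, not from nlp-recipes):

import torch
from torch.utils.data import TensorDataset

ds = TensorDataset(torch.randn(100, 8))
loader = dataloader_from_dataset(ds, batch_size=16, num_gpus=0, shuffle=False)
# num_gpus=0 leaves batch_size at 16; shuffle=False and distributed=False
# fall through to SequentialSampler, so batches arrive in dataset order.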
Example #19
Source File: run_hnn.py    From mt-dnn with MIT License (5 votes)
def run_predict(args, model, device, test_data, prefix=None):
  # Run prediction for full data
  eval_results=OrderedDict()
  for test_item in test_data:
    torch.cuda.empty_cache()
    name = test_item.name
    test_sampler = SequentialSampler(test_item.data)
    test_dataloader = SequentialDataLoader(test_item.data, sampler=test_sampler, batch_size=args.predict_batch_size, num_workers = args.worker_num)
    model.eval()
    sm_predicts=[]
    lm_predicts=[]
    en_predicts=[]
    for input_ids,tids,_ in tqdm(test_dataloader, ncols=80, desc='Predicting: {}'.format(prefix)):
      with torch.no_grad():
        sm_logits, lm_logits, en_logits,_ = model(input_ids, tids)
      if sm_logits is not None:
        sm_predicts.append(sm_logits.detach().cpu().numpy())
      if lm_logits is not None:
        lm_predicts.append(lm_logits.detach().cpu().numpy())
      if en_logits is not None:
        en_predicts.append(en_logits.detach().cpu().numpy())
    def pred(predicts, tag):
      output_test_file = os.path.join(args.output_dir, "test_results_{}_{}_{}.txt".format(name, prefix, tag))
      logger.info("***** Dump prediction results-{}-{}-{} *****".format(name, prefix, tag))
      logger.info("Location: {}".format(output_test_file))
      np.savetxt(output_test_file, predicts, delimiter='\t')
      predict_fn = test_item.predict_fn
      if predict_fn:
        predict_fn(predicts, args.output_dir, name, prefix, tag=tag)
    if len(lm_predicts)>0:
      lm_predicts = np.concatenate(lm_predicts, axis=0)
      pred(lm_predicts, 'LM-')

    if len(sm_predicts)>0:
      sm_predicts = np.concatenate(sm_predicts, axis=0)
      pred(sm_predicts, 'SIM-')

    if len(en_predicts)>0:
      en_predicts = np.concatenate(en_predicts, axis=0)
      pred(en_predicts, 'EN-Avg-') 
Example #20
Source File: run_sequence_level_classification.py    From ZEN with Apache License 2.0 (5 votes)
def evaluate(args, model, tokenizer, ngram_dict, processor, label_list):
    eval_dataset = load_examples(args, tokenizer, ngram_dict, processor, label_list, mode="test")
    # Run prediction for full data
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_dataset)
    else:
        eval_sampler = DistributedSampler(eval_dataset)  # Note that this sampler samples randomly
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    model.eval()
    preds = []
    out_label_ids = None

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids, input_ngram_ids, ngram_position_matrix, \
        ngram_lengths, ngram_seg_ids, ngram_masks = batch

        with torch.no_grad():
            logits = model(input_ids=input_ids,
                           input_ngram_ids=input_ngram_ids,
                           ngram_position_matrix=ngram_position_matrix,
                           labels=None, head_mask=None)

        if len(preds) == 0:
            preds.append(logits.detach().cpu().numpy())
            out_label_ids = label_ids.detach().cpu().numpy()
        else:
            preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

    preds = np.argmax(preds[0], axis=1)
    return compute_metrics(args.task_name, preds, out_label_ids) 
Example #21
Source File: run_squad_document_full_e2e.py    From RE3QA with Apache License 2.0 (5 votes)
def build_eval_data(args, eval_examples, eval_features, filtered_eval_features, filtered_rank_logits, logger):
    predict_batch_size_for_rank = 2 * args.predict_batch_size

    logger.info("Num orig examples = %d", len(eval_examples))
    logger.info("Num split features = %d", len(eval_features))
    logger.info("Num split filtered features = %d", len(filtered_eval_features))
    logger.info("Batch size for ranker = %d", predict_batch_size_for_rank)
    logger.info("Batch size for reader = %d", args.predict_batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_rank_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
    if args.local_rank == -1:
        eval_rank_sampler = SequentialSampler(eval_rank_data)
    else:
        eval_rank_sampler = DistributedSampler(eval_rank_data)
    eval_rank_dataloader = DataLoader(eval_rank_data, sampler=eval_rank_sampler, batch_size=predict_batch_size_for_rank)

    all_input_ids = torch.tensor([f.input_ids for f in filtered_eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in filtered_eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in filtered_eval_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_read_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
    if args.local_rank == -1:
        eval_read_sampler = SequentialSampler(eval_read_data)
    else:
        eval_read_sampler = DistributedSampler(eval_read_data)
    eval_read_dataloader = DataLoader(eval_read_data, sampler=eval_read_sampler, batch_size=args.predict_batch_size)
    return eval_examples, eval_features, filtered_eval_features, filtered_rank_logits, eval_rank_dataloader, \
           eval_read_dataloader 
Example #22
Source File: extract_features.py    From interpret_bert with GNU General Public License v3.0 (5 votes)
def save(args, model, tokenizer, device):
  # convert data to ids
  examples = read_examples(args.data_file, 0.09, 0.01, 50) # default numbers obtained from Linzen et al.
  
  # extract and write features
  for s_name in examples:
    s_instances = examples[s_name] 
    output_file = args.output_folder + s_name + ".json"
    features = convert_examples_to_features(s_instances, seqLen=2+get_max_seq_length(s_instances, tokenizer), tokenizer=tokenizer)
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

    pbar = tqdm(total=len(s_instances)//args.batch_size)
    with open(output_file, "w", encoding='utf-8') as writer:
      for input_ids, input_mask, example_indices in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
        for b, example_index in enumerate(example_indices):
          unique_id = example_index.item()
          output_json = collections.OrderedDict()
          output_json["linex_index"] = unique_id
          verb_index = s_instances[unique_id]['verb_index']-1
          layers = []
          for layer_index in range(len(all_encoder_layers)):
            layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
            layers.append([round(x.item(), 6) for x in layer_output[b][verb_index]])
          output_json["verb_layers"] = layers
          output_json["linzen_info"] = s_instances[unique_id]
          writer.write(json.dumps(output_json) + "\n")
        pbar.update(1)
    pbar.close()
    print('written features to %s'%output_file) 
Example #23
Source File: extract_features.py    From interpret_bert with GNU General Public License v3.0 (5 votes)
def save(args, model, tokenizer, device):
  # convert data to ids
  examples = read_examples(args.train_file, 3000, 500) # default numbers of labeled and unlabeled chunks to consider, from https://aclweb.org/anthology/D18-1179
  features = convert_examples_to_features(examples=examples, seq_length=2+get_max_seq_length(examples, tokenizer), tokenizer=tokenizer)
  chunk_spans = get_chunk_spans(examples, features)
  
  # extract and write features
  all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
  all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
  all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
  eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
  eval_sampler = SequentialSampler(eval_data)
  eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

  pbar = tqdm(total=len(examples)//args.batch_size)
  with open(args.output_file, "w", encoding='utf-8') as writer:
    for input_ids, input_mask, example_indices in eval_dataloader:
      input_ids = input_ids.to(device)
      input_mask = input_mask.to(device)
      all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
      for b, example_index in enumerate(example_indices):
        feature_info = features[example_index.item()]
        unique_id = int(feature_info.unique_id)
        example_info, chunk_info = examples[unique_id], chunk_spans[unique_id]
        output_json = collections.OrderedDict()
        output_json["linex_index"] = unique_id
        output_json["label"] = example_info.label
        output_json["tokens"] = feature_info.tokens
        output_json["chunk_start_idx"] = chunk_info[0]
        output_json["chunk_end_idx"] = chunk_info[1]
        span_start_layers, span_end_layers = [], []
        for layer_index in range(len(all_encoder_layers)):
          layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
          span_start_layers.append([round(x.item(), 6) for x in layer_output[b][chunk_info[0]]])
          span_end_layers.append([round(x.item(), 6) for x in layer_output[b][chunk_info[1]]])
        output_json["start_layer"] = span_start_layers
        output_json["end_layer"] = span_end_layers
        writer.write(json.dumps(output_json) + "\n")
      pbar.update(1)
  pbar.close()
  print('written features to %s'%(args.output_file)) 
Example #24
Source File: data.py    From fast-bert with Apache License 2.0 (5 votes)
def get_dl_from_texts(self, texts):

        test_examples = []
        input_data = []

        for index, text in enumerate(texts):
            test_examples.append(InputExample(index, text, label=None))
            input_data.append({
                'id': index,
                'text': text
            })
        test_features = convert_examples_to_features(test_examples, label_list=self.labels,
                                                     tokenizer=self.tokenizer, max_seq_length=self.maxlen)

        all_input_ids = torch.tensor(
            [f.input_ids for f in test_features], dtype=torch.long)
        all_input_mask = torch.tensor(
            [f.input_mask for f in test_features], dtype=torch.long)
        all_segment_ids = torch.tensor(
            [f.segment_ids for f in test_features], dtype=torch.long)

        test_data = TensorDataset(
            all_input_ids, all_input_mask, all_segment_ids)

        test_sampler = SequentialSampler(test_data)
        return DataLoader(test_data, sampler=test_sampler, batch_size=self.bs) 
Example #25
Source File: data_abs.py    From fast-bert with Apache License 2.0 (5 votes)
def get_dl_from_texts(self, texts):

        dataset = SummarizationInMemoryDataset(texts)

        sampler = SequentialSampler(dataset)

        collate_fn = lambda data: collate(
            data, self.tokenizer, block_size=self.max_seq_length, device=self.device
        )
        return DataLoader(
            dataset,
            sampler=sampler,
            batch_size=self.batch_size_per_gpu,
            collate_fn=collate_fn,
        ) 
Example #26
Source File: run_cls_span.py    From SpanABSA with Apache License 2.0 (5 votes)
def pipeline_eval_data(args, tokenizer, logger):
    if args.debug:
        args.predict_batch_size = 8

    eval_path = os.path.join(args.data_dir, args.predict_file)
    eval_set = read_absa_data(eval_path)
    eval_examples = convert_absa_data(dataset=eval_set, verbose_logging=args.verbose_logging)

    eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length,
                                                 args.verbose_logging, logger)

    assert args.extraction_file is not None
    eval_extract_preds = []
    extract_predictions = pickle.load(open(args.extraction_file, 'rb'))
    extract_dict = {}
    for pred in extract_predictions:
        extract_dict[pred.unique_id] = pred
    for eval_feature in eval_features:
        eval_extract_preds.append(extract_dict[eval_feature.unique_id])
    assert len(eval_extract_preds) == len(eval_features)

    logger.info("Num orig examples = %d", len(eval_examples))
    logger.info("Num split features = %d", len(eval_features))
    logger.info("Batch size = %d", args.predict_batch_size)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_span_starts = torch.tensor([f.start_indexes for f in eval_extract_preds], dtype=torch.long)
    all_span_ends = torch.tensor([f.end_indexes for f in eval_extract_preds], dtype=torch.long)
    all_label_masks = torch.tensor([f.span_masks for f in eval_extract_preds], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_span_starts, all_span_ends,
                              all_label_masks, all_example_index)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
    return eval_examples, eval_features, eval_dataloader 
Example #27
Source File: run_cls_span.py    From SpanABSA with Apache License 2.0 (5 votes)
def read_eval_data(args, tokenizer, logger):
    if args.debug:
        args.predict_batch_size = 8

    eval_path = os.path.join(args.data_dir, args.predict_file)
    eval_set = read_absa_data(eval_path)
    eval_examples = convert_absa_data(dataset=eval_set, verbose_logging=args.verbose_logging)
    eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length,
                                                 args.verbose_logging, logger)

    logger.info("Num orig examples = %d", len(eval_examples))
    logger.info("Num split features = %d", len(eval_features))
    logger.info("Batch size = %d", args.predict_batch_size)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_span_starts = torch.tensor([f.start_indexes for f in eval_features], dtype=torch.long)
    all_span_ends = torch.tensor([f.end_indexes for f in eval_features], dtype=torch.long)
    all_label_masks = torch.tensor([f.label_masks for f in eval_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_span_starts, all_span_ends,
                              all_label_masks, all_example_index)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
    return eval_examples, eval_features, eval_dataloader 
Example #28
Source File: run_bert.py    From Bert-Multi-Label-Text-Classification with MIT License (5 votes)
def run_test(args):
    from pybert.io.task_data import TaskData
    from pybert.test.predictor import Predictor
    data = TaskData()
    targets, sentences = data.read_data(raw_data_path=config['test_path'],
                                        preprocessor=EnglishPreProcessor(),
                                        is_train=False)
    lines = list(zip(sentences, targets))
    processor = BertProcessor(vocab_path=config['bert_vocab_path'], do_lower_case=args.do_lower_case)
    label_list = processor.get_labels()
    id2label = {i: label for i, label in enumerate(label_list)}

    test_data = processor.get_test(lines=lines)
    test_examples = processor.create_examples(lines=test_data,
                                              example_type='test',
                                              cached_examples_file=config[
                                            'data_dir'] / f"cached_test_examples_{args.arch}")
    test_features = processor.create_features(examples=test_examples,
                                              max_seq_len=args.eval_max_seq_len,
                                              cached_features_file=config[
                                            'data_dir'] / "cached_test_features_{}_{}".format(
                                                  args.eval_max_seq_len, args.arch
                                              ))
    test_dataset = processor.create_dataset(test_features)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.train_batch_size,
                                 collate_fn=collate_fn)
    model = BertForMultiLable.from_pretrained(config['checkpoint_dir'], num_labels=len(label_list))

    # ----------- predicting
    logger.info('model predicting....')
    predictor = Predictor(model=model,
                          logger=logger,
                          n_gpu=args.n_gpu)
    result = predictor.predict(data=test_dataloader)
    print(result) 
Example #29
Source File: train.py    From subword-qac with MIT License (5 votes)
def test(model, tokenizer, test_data, args):
    logger.info("Test starts!")
    model_load(args.model_dir, model)
    model = model.to(device)

    test_dataset = QueryDataset(test_data)
    test_data_loader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset),
                                  batch_size=args.bsz, num_workers=args.num_workers,
                                  collate_fn=lambda x: collate_fn(x, tokenizer, args.sample, args.max_seq_len))

    test_loss, test_str = evaluate(model, test_data_loader)
    logger.info(f"| test  | {test_str}") 
Example #30
Source File: run_xlnet.py    From Bert-Multi-Label-Text-Classification with MIT License (5 votes)
def run_test(args):
    from pybert.io.task_data import TaskData
    from pybert.test.predictor import Predictor
    data = TaskData()
    targets, sentences = data.read_data(raw_data_path=config['test_path'],
                                        preprocessor=EnglishPreProcessor(),
                                        is_train=True)
    lines = zip(sentences, targets)
    processor = XlnetProcessor(vocab_path=config['xlnet_vocab_path'], do_lower_case=args.do_lower_case)
    label_list = processor.get_labels()
    id2label = {i: label for i, label in enumerate(label_list)}

    test_data = processor.get_test(lines=lines)
    test_examples = processor.create_examples(lines=test_data,
                                              example_type='test',
                                              cached_examples_file=config[
                                                    'data_dir'] / f"cached_test_examples_{args.arch}")
    test_features = processor.create_features(examples=test_examples,
                                              max_seq_len=args.eval_max_seq_len,
                                              cached_features_file=config[
                                                        'data_dir'] / "cached_test_features_{}_{}".format(
                                                  args.eval_max_seq_len, args.arch
                                              ))
    test_dataset = processor.create_dataset(test_features)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.train_batch_size,
                                 collate_fn=collate_fn)
    model = XlnetForMultiLable.from_pretrained(config['checkpoint_dir'], num_labels=len(label_list))
    # ----------- predicting
    logger.info('model predicting....')
    predictor = Predictor(model=model,logger=logger,n_gpu=args.n_gpu)
    result = predictor.predict(data=test_dataloader)
    print(result)