Python Examples of torch.utils.data.SequentialSampler

Source File: model.py From MAX-Toxic-Comment-Classifier with Apache License 2.0

6 votes

def _pre_process(self, input):
        """Convert raw input texts into a batched dataloader kept on the instance.

        Stores the current time in ``self.start_time``, turns every input item
        into an ``InputExample`` and then into features, and saves a
        sequentially sampled ``DataLoader`` in ``self.test_dataloader``.

        Args:
            input: iterable of texts; each item becomes ``text_a`` of one example,
                with its position used as the example's ``guid``.

        Returns:
            The list of ``InputExample`` objects built from ``input``.
        """
        # Timestamp is taken before any work so the prediction time can be measured.
        self.start_time = time.time()

        # One unlabeled example per input item.
        examples = []
        for position, text in enumerate(input):
            examples.append(InputExample(guid=position, text_a=text, labels=[]))
        features = convert_examples_to_features(examples, self.max_seq_length, self.tokenizer)

        def as_long_tensor(field):
            # Stack one feature attribute across all features into a long tensor.
            return torch.tensor([getattr(feat, field) for feat in features], dtype=torch.long)

        dataset = TensorDataset(
            as_long_tensor("input_ids"),
            as_long_tensor("input_mask"),
            as_long_tensor("segment_ids"),
        )

        # Sequential sampling keeps batches in the same order as the inputs.
        self.test_dataloader = DataLoader(
            dataset,
            sampler=SequentialSampler(dataset),
            batch_size=self.eval_batch_size,
        )

        return examples

Source File: njuner.py From bert-ner with MIT License

6 votes

def _predict_features(self, features, tokens):
        """Predict one label per token for every feature.

        Args:
            features: sequence of objects exposing ``input_ids`` and
                ``input_mask`` (equal-length lists of ints).
            tokens: sequence of token lists, aligned with ``features``.

        Returns:
            A list with one entry per item of ``tokens``; each entry is the list
            of label strings (looked up in ``self._label_list``) for positions
            ``1 .. len(token_line)`` of the corresponding prediction, i.e. the
            first predicted position is skipped.
        """
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        predict_data = TensorDataset(all_input_ids, all_input_mask)
        predict_dataloader = DataLoader(
            predict_data,
            sampler=SequentialSampler(predict_data),
            batch_size=self._batch_size,
        )

        self._model.eval()
        predict_ids = []
        # Inference only: disabling autograd avoids building a graph that is
        # thrown away immediately (the logits were detached anyway).
        with torch.no_grad():
            for batch in predict_dataloader:
                input_ids, input_mask = tuple(t.to(self._device) for t in batch)
                logits = self._model(input_ids, input_mask)
                logits = logits.detach().cpu().numpy()
                # argmax over the last axis picks the best label id per position.
                predict_ids.extend(np.argmax(logits, -1).tolist())

        return [
            [self._label_list[label_id] for label_id in predict_line[1: 1 + len(token_line)]]
            for token_line, predict_line in zip(tokens, predict_ids)
        ]

Source File: data_cls.py From fast-bert with Apache License 2.0

6 votes

def get_dl_from_texts(self, texts):
        """Build a sequential DataLoader over a list of raw texts.

        Args:
            texts: iterable of texts; each becomes one unlabeled
                ``InputExample`` whose id is its position in ``texts``.

        Returns:
            A ``DataLoader`` over the dataset produced by
            ``self.get_dataset_from_examples``, sampled in order with batches
            of ``self.batch_size_per_gpu``.
        """
        # The original also accumulated an `input_data` list of dicts that was
        # never read; it has been dropped.
        test_examples = [
            InputExample(index, text, label=None) for index, text in enumerate(texts)
        ]

        test_dataset = self.get_dataset_from_examples(
            test_examples, "test", is_test=True, no_cache=True
        )

        return DataLoader(
            test_dataset,
            sampler=SequentialSampler(test_dataset),
            batch_size=self.batch_size_per_gpu,
        )

Source File: run_extract_span.py From SpanABSA with Apache License 2.0

6 votes

def read_eval_data(args, tokenizer, logger):
    """Load the prediction file and wrap its features in an evaluation DataLoader.

    Args:
        args: options object; reads ``data_dir``, ``predict_file``,
            ``verbose_logging``, ``max_seq_length``, ``predict_batch_size``
            and ``local_rank``.
        tokenizer: tokenizer forwarded to ``convert_examples_to_features``.
        logger: logger used for the summary lines and forwarded to the
            feature conversion.

    Returns:
        Tuple ``(eval_examples, eval_features, eval_dataloader)``.
    """
    examples = convert_absa_data(
        dataset=read_absa_data(os.path.join(args.data_dir, args.predict_file)),
        verbose_logging=args.verbose_logging,
    )
    features = convert_examples_to_features(
        examples, tokenizer, args.max_seq_length, args.verbose_logging, logger
    )

    logger.info("Num orig examples = %d", len(examples))
    logger.info("Num split features = %d", len(features))
    logger.info("Batch size = %d", args.predict_batch_size)

    columns = [
        torch.tensor([getattr(feat, field) for feat in features], dtype=torch.long)
        for field in ("input_ids", "input_mask", "segment_ids")
    ]
    # Last column: position of each feature, so a batch row can be mapped back.
    columns.append(torch.arange(columns[0].size(0), dtype=torch.long))
    dataset = TensorDataset(*columns)

    # local_rank == -1 selects the in-order sampler; anything else uses the distributed one.
    sampler_cls = SequentialSampler if args.local_rank == -1 else DistributedSampler
    loader = DataLoader(dataset, sampler=sampler_cls(dataset), batch_size=args.predict_batch_size)
    return examples, features, loader

Source File: run_joint_span.py From SpanABSA with Apache License 2.0

6 votes

def read_eval_data(args, tokenizer, logger):
    """Read the evaluation set and return examples, features and a DataLoader.

    Args:
        args: options object; reads ``data_dir``, ``predict_file``,
            ``verbose_logging``, ``max_seq_length``, ``predict_batch_size``
            and ``local_rank``.
        tokenizer: tokenizer passed on to ``convert_examples_to_features``.
        logger: logger that receives three summary lines and is passed on to
            the feature conversion.

    Returns:
        Tuple ``(eval_examples, eval_features, eval_dataloader)``.
    """
    raw_set = read_absa_data(os.path.join(args.data_dir, args.predict_file))
    eval_examples = convert_absa_data(dataset=raw_set, verbose_logging=args.verbose_logging)
    eval_features = convert_examples_to_features(
        eval_examples, tokenizer, args.max_seq_length, args.verbose_logging, logger
    )

    for message, value in (
        ("Num orig examples = %d", len(eval_examples)),
        ("Num split features = %d", len(eval_features)),
        ("Batch size = %d", args.predict_batch_size),
    ):
        logger.info(message, value)

    def stack(field):
        # Gather one attribute from every feature into a long tensor.
        return torch.tensor([getattr(f, field) for f in eval_features], dtype=torch.long)

    input_ids = stack("input_ids")
    eval_data = TensorDataset(
        input_ids,
        stack("input_mask"),
        stack("segment_ids"),
        # Running index of each feature within eval_features.
        torch.arange(input_ids.size(0), dtype=torch.long),
    )

    if args.local_rank != -1:
        eval_sampler = DistributedSampler(eval_data)
    else:
        eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(
        eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size
    )
    return eval_examples, eval_features, eval_dataloader

Source File: data_loading.py From pytorch-lightning with Apache License 2.0

6 votes

def auto_add_sampler(self, dataloader: DataLoader, train: bool) -> DataLoader:
        """Swap the dataloader's sampler for a distributed one when required.

        The input is returned untouched when it is not a ``DataLoader``, when
        it wraps an iterable dataset, or when no replacement is requested
        (``self.replace_sampler_ddp`` is falsy or none of the distributed
        flags is set).

        Args:
            dataloader: object to inspect and possibly rebuild.
            train: not used by this method.

        Returns:
            Either ``dataloader`` itself or the result of
            ``self.replace_sampler`` with the distributed sampler.

        Raises:
            MisconfigurationException: if a replacement is required but the
                current sampler is neither a ``SequentialSampler`` nor a
                ``RandomSampler``.
        """
        is_dataloader = isinstance(dataloader, DataLoader)
        is_iterable_ds = _has_iterable_dataset(dataloader)

        # Only real DataLoaders over non-iterable datasets are candidates.
        if not (is_dataloader and not is_iterable_ds):
            return dataloader

        need_dist_sampler = (self.use_ddp or self.use_ddp2 or self.use_horovod or self.use_tpu)
        if not (self.replace_sampler_ddp and need_dist_sampler):
            return dataloader

        # A user-supplied custom sampler would be silently discarded, so refuse.
        if not isinstance(dataloader.sampler, (SequentialSampler, RandomSampler)):
            raise MisconfigurationException(
                'You seem to have configured a sampler in your DataLoader. This will be replaced '
                ' by `DistributedSampler` since `replace_sampler_ddp` is True and you are using'
                ' distributed training. Either remove the sampler from your DataLoader or set'
                ' `replace_sampler_ddp`=False if you want to use your custom sampler.')

        distributed_sampler = self._get_distributed_sampler(dataloader)
        return self.replace_sampler(dataloader, distributed_sampler)

Source File: base_task.py From Doc2EDAG with MIT License

6 votes

def prepare_data_loader(self, dataset, batch_size, rand_flag=True):
        """Create a DataLoader over ``dataset``.

        Args:
            dataset: dataset to iterate over.
            batch_size: number of items per batch.
            rand_flag: when true a ``RandomSampler`` is used, otherwise a
                ``SequentialSampler``.

        Returns:
            A ``DataLoader``; ``self.custom_collate_fn`` is passed as its
            ``collate_fn`` only when it is not ``None``.
        """
        sampler_cls = RandomSampler if rand_flag else SequentialSampler
        loader_kwargs = {"batch_size": batch_size, "sampler": sampler_cls(dataset)}

        # Forward a collate function only when one is configured, so the
        # DataLoader keeps its own default otherwise.
        if self.custom_collate_fn is not None:
            loader_kwargs["collate_fn"] = self.custom_collate_fn

        return DataLoader(dataset, **loader_kwargs)

Source File: bert.py From BERT-SQuAD with GNU Affero General Public License v3.0

5 votes

def predict(self, passage: str, question: str):
        """Answer ``question`` from ``passage`` with the wrapped model.

        Args:
            passage: text to search for the answer.
            question: question text.

        Returns:
            Whatever ``get_answer`` returns for the collected raw results.
        """
        example = input_to_squad_example(passage, question)
        features = squad_examples_to_features(
            example, self.tokenizer, self.max_seq_length, self.doc_stride, self.max_query_length
        )

        def stack(field):
            # Gather one attribute from every feature into a long tensor.
            return torch.tensor([getattr(f, field) for f in features], dtype=torch.long)

        input_ids = stack("input_ids")
        dataset = TensorDataset(
            input_ids,
            stack("input_mask"),
            stack("segment_ids"),
            # Position of each feature, used to look it up again after batching.
            torch.arange(input_ids.size(0), dtype=torch.long),
        )
        # One feature per batch, visited in order.
        eval_dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=1)

        all_results = []
        for batch in eval_dataloader:
            ids, mask, segments, example_indices = tuple(t.to(self.device) for t in batch)
            with torch.no_grad():
                outputs = self.model(
                    input_ids=ids,
                    attention_mask=mask,
                    token_type_ids=segments,
                )

            for row, example_index in enumerate(example_indices):
                feature = features[example_index.item()]
                all_results.append(
                    RawResult(
                        unique_id=int(feature.unique_id),
                        start_logits=to_list(outputs[0][row]),
                        end_logits=to_list(outputs[1][row]),
                    )
                )

        return get_answer(
            example, features, all_results, self.n_best_size, self.max_answer_length, self.do_lower_case
        )

Source File: run_albert.py From Bert-Multi-Label-Text-Classification with MIT License

5 votes

def run_test(args):
    """Run the ALBERT multi-label model over the configured test set and print the result.

    Reads the raw test data from ``config['test_path']``, builds examples and
    features (passing cache-file paths under ``config['data_dir']`` to the
    processor), loads the model from ``config['checkpoint_dir']`` and prints
    whatever ``Predictor.predict`` returns. Nothing is returned.

    Args:
        args: options object; this function reads ``do_lower_case``, ``arch``,
            ``eval_max_seq_len``, ``train_batch_size`` and ``n_gpu``.
    """
    # Imported inside the function, so these modules are only loaded when testing runs.
    from pybert.io.task_data import TaskData
    from pybert.test.predictor import Predictor
    data = TaskData()
    targets, sentences = data.read_data(raw_data_path=config['test_path'],
                                        preprocessor=EnglishPreProcessor(),
                                        is_train=False)
    # Pair every sentence with its target, in file order.
    lines = list(zip(sentences, targets))
    processor = AlbertProcessor(spm_model_file=config['albert_vocab_path'], do_lower_case=args.do_lower_case,
                                vocab_file=None)
    label_list = processor.get_labels()
    # NOTE(review): id2label is built but never used in this function.
    id2label = {i: label for i, label in enumerate(label_list)}

    test_data = processor.get_test(lines=lines)
    # The cache file names embed the architecture (and, for features, the max sequence length).
    test_examples = processor.create_examples(lines=test_data,
                                              example_type='test',
                                              cached_examples_file=config[
                                             'data_dir'] / f"cached_test_examples_{args.arch}")
    test_features = processor.create_features(examples=test_examples,
                                              max_seq_len=args.eval_max_seq_len,
                                              cached_features_file=config[
                                              'data_dir'] / "cached_test_features_{}_{}".format(
                                                  args.eval_max_seq_len, args.arch
                                              ))
    test_dataset = processor.create_dataset(test_features)
    # Sequential sampling keeps predictions in the same order as the input lines.
    test_sampler = SequentialSampler(test_dataset)
    # NOTE(review): the test loader is sized with args.train_batch_size — confirm this is intended.
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.train_batch_size,
                                 collate_fn=collate_fn)
    model = AlbertForMultiLable.from_pretrained(config['checkpoint_dir'], num_labels=len(label_list))

    # ----------- predicting
    logger.info('model predicting....')
    predictor = Predictor(model=model,logger=logger,n_gpu=args.n_gpu)
    result = predictor.predict(data=test_dataloader)
    print(result)

Source File: dataloader.py From nonechucks with MIT License

5 votes

def _replace_default_samplers(cls):
        """Patch ``data.dataloader`` so its default samplers are wrapped in ``SafeSampler``.

        The current ``SequentialSampler`` and ``RandomSampler`` attributes of
        ``data.dataloader`` are first saved on ``cls`` as ``cls.sequential``
        and ``cls.random``, then replaced by callables that build the matching
        sampler from ``data`` and wrap it in a ``SafeSampler``.
        """
        loader_module = data.dataloader

        # Remember what was installed so it can be put back later.
        cls.sequential = loader_module.SequentialSampler
        cls.random = loader_module.RandomSampler

        def safe_sampler_callable(sampler_cls, dataset):
            inner_sampler = sampler_cls(dataset)
            return SafeSampler(dataset, inner_sampler)

        for sampler_name in ("SequentialSampler", "RandomSampler"):
            setattr(
                loader_module,
                sampler_name,
                partial(safe_sampler_callable, getattr(data, sampler_name)),
            )

Source File: post.py From denspi with Apache License 2.0

5 votes

def convert_question_features_to_dataloader(query_eval_features, fp16, local_rank, predict_batch_size):
    """Wrap question features in a DataLoader.

    Args:
        query_eval_features: sequence of objects exposing ``input_ids`` and
            ``input_mask``.
        fp16: when true, the id and mask tensors are converted with ``.half()``.
        local_rank: ``-1`` selects a ``SequentialSampler``; any other value a
            ``DistributedSampler``.
        predict_batch_size: batch size of the returned loader.

    Returns:
        A ``DataLoader`` yielding ``(input_ids, input_mask, example_index)``
        batches, where ``example_index`` is the position of the feature.
    """
    input_ids = torch.tensor([feat.input_ids for feat in query_eval_features], dtype=torch.long)
    input_mask = torch.tensor([feat.input_mask for feat in query_eval_features], dtype=torch.long)
    example_index = torch.arange(input_ids.size(0), dtype=torch.long)

    if fp16:
        # NOTE(review): this casts integer token ids to float16 as well as the
        # mask — behaviour kept as-is, confirm that is intended.
        input_ids = input_ids.half()
        input_mask = input_mask.half()

    question_data = TensorDataset(input_ids, input_mask, example_index)
    sampler_cls = SequentialSampler if local_rank == -1 else DistributedSampler
    return DataLoader(
        question_data,
        sampler=sampler_cls(question_data),
        batch_size=predict_batch_size,
    )

Source File: induce_dep_trees.py From interpret_bert with GNU General Public License v3.0

5 votes

def save(args, model, tokenizer, device):
  """Print the token heads induced from one attention head for ``args.sentence_text``.

  The sentence is converted to features, run through ``model``, and the
  attention matrix selected by ``args.layer_id`` / ``args.head_id`` (both
  1-based) is passed to ``mst``. The tokens and the head token of each
  position are printed. Despite the name, nothing is written to disk in this
  function and nothing is returned.

  Args:
    args: options object; reads ``sentence_text``, ``batch_size``,
      ``layer_id``, ``head_id`` and ``sentence_root``.
    model: callable returning ``(all_encoder_layers, pooled_layer,
      raw_attn_layers)``.
    tokenizer: tokenizer forwarded to the feature conversion.
    device: device the input tensors are moved to.
  """
  # convert data to ids
  examples = [args.sentence_text]
  # seq_length adds 2 to the longest example — presumably room for two special tokens; confirm.
  features = convert_examples_to_features(
        examples=examples, seq_length=2+get_max_seq_length(examples, 
                                                           tokenizer),
        tokenizer=tokenizer)

  # extract and print dependency parses
  all_input_ids = torch.tensor([f.input_ids for f in features], 
                               dtype=torch.long)
  all_input_mask = torch.tensor([f.input_mask for f in features], 
                              dtype=torch.long)
  # Position of each feature, so a batch row can be mapped back to `features`.
  all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
  eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
  eval_sampler = SequentialSampler(eval_data)
  eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, 
                             batch_size=args.batch_size)
  for input_ids, input_mask, example_indices in eval_dataloader:
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    all_encoder_layers, pooled_layer, raw_attn_layers = model(input_ids, 
                                                      token_type_ids=None,
                                                 attention_mask=input_mask)
    # Drop the first and last token of the first example in the batch
    # (presumably the special boundary tokens — confirm).
    cur_tokens = features[example_indices[0]].tokens[1:-1]
    # layer_id and head_id are 1-based, hence the -1 when indexing.
    cur_layer = raw_attn_layers[args.layer_id-1].squeeze()
    cur_head = cur_layer[args.head_id-1]
    # Keep the top-left (len(cur_tokens)+1)-square of the attention matrix.
    cur_attn_matrix = cur_head[0:len(cur_tokens)+1, 0:len(cur_tokens)+1].detach().cpu().numpy()
    # Column 0 is set to -1 everywhere except the row given by args.sentence_root.
    cur_attn_matrix[:,0] = -1.
    cur_attn_matrix[args.sentence_root,0] = 1.0
    # The diagonal is set to -1 as well before running mst.
    np.fill_diagonal(cur_attn_matrix, -1.)
    mst_out = mst(cur_attn_matrix)
    # Index 0 stands for the artificial root in the printed token list.
    tokens = ['<root>'] + cur_tokens
    print('tokens ==>')
    print(tokens)
    print('heads ==>')
    print([tokens[head_id] for head_id in mst_out])
    # Only the first batch is processed.
    break

Source File: dataloader.py From nonechucks with MIT License

5 votes

def _restore_default_samplers(cls):
        """Reinstall the samplers saved on ``cls`` into ``data.dataloader``.

        ``cls.sequential`` becomes ``data.dataloader.SequentialSampler`` and
        ``cls.random`` becomes ``data.dataloader.RandomSampler``.
        """
        for target_name, saved_name in (
            ("SequentialSampler", "sequential"),
            ("RandomSampler", "random"),
        ):
            setattr(data.dataloader, target_name, getattr(cls, saved_name))

Source File: dcca.py From mvlearn with Apache License 2.0

5 votes

def _get_outputs(self, x1, x2):
        """
        Private function to run both views through the model in batches and
        collect the transformed data together with the per-batch loss.

        Parameters
        ----------
        x1 : torch.tensor
            Input view 1 data.
        x2 : torch.tensor
            Input view 2 data.

        Returns
        -------
        losses : list
            One loss value (a Python number) per batch of the input data.
        outputs : list of numpy arrays
            outputs[i] is the concatenated model output for view i.
        """
        batch_losses = []
        view1_chunks, view2_chunks = [], []

        with torch.no_grad():
            self.model_.eval()
            # Consecutive index batches over the rows of x1; the last batch
            # may be shorter because drop_last is False.
            index_batches = BatchSampler(
                SequentialSampler(range(x1.size(0))),
                batch_size=self.batch_size_,
                drop_last=False,
            )
            for rows in index_batches:
                out1, out2 = self.model_(x1[rows, :], x2[rows, :])
                view1_chunks.append(out1)
                view2_chunks.append(out2)
                batch_losses.append(self.loss_(out1, out2).item())

        outputs = [
            torch.cat(chunks, dim=0).cpu().numpy()
            for chunks in (view1_chunks, view2_chunks)
        ]
        return batch_losses, outputs

Source File: sequence_classification.py From nlp-architect with Apache License 2.0

5 votes

def inference(
        self,
        examples: List[SequenceClsInputExample],
        max_seq_length: int,
        batch_size: int = 64,
        evaluate=False,
    ):
        """
        Run inference on given examples

        Args:
            examples (List[SequenceClsInputExample]): examples
            max_seq_length (int): forwarded to ``convert_to_tensors``.
            batch_size (int, optional): batch size. Defaults to 64.
            evaluate (bool, optional): when true, labels are included in the
                tensors and ``evaluate_predictions`` is called on the logits
                and label ids. Defaults to False.

        Returns:
            the output of ``_postprocess_logits`` applied to the logits
        """
        data_set = self.convert_to_tensors(
            examples, max_seq_length=max_seq_length, include_labels=evaluate
        )
        loader = DataLoader(data_set, sampler=SequentialSampler(data_set), batch_size=batch_size)
        result = self._evaluate(loader)

        if evaluate:
            # With labels included, _evaluate hands back (logits, label_ids).
            logits, label_ids = result
            preds = self._postprocess_logits(logits)
            self.evaluate_predictions(logits, label_ids)
            return preds
        return self._postprocess_logits(result)

Source File: token_classification.py From nlp-architect with Apache License 2.0

5 votes

def inference(
        self, examples: List[TokenClsInputExample], max_seq_length: int, batch_size: int = 64
    ):
        """
        Run inference on given examples

        Args:
            examples (List[TokenClsInputExample]): examples
            max_seq_length (int): forwarded to ``convert_to_tensors``.
            batch_size (int, optional): batch size. Defaults to 64.

        Returns:
            List(tuple): one ``(tokens, tags)`` tuple per example
        """
        data_set = self.convert_to_tensors(
            examples, max_seq_length=max_seq_length, include_labels=False
        )
        loader = DataLoader(data_set, sampler=SequentialSampler(data_set), batch_size=batch_size)
        logits = self._evaluate(loader)

        # Positions whose entry in the last dataset tensor is non-zero are kept.
        keep_mask = data_set.tensors[-1].view(len(data_set), -1) != 0.0
        predicted = torch.argmax(F.log_softmax(logits, dim=2), dim=2)

        kept_ids = [
            predicted[row][keep_mask[row]].detach().cpu().numpy()
            for row in range(predicted.size()[0])
        ]
        # Ids missing from the label map fall back to "O".
        return [
            (ex.tokens, [self.labels_id_map.get(tag_id, "O") for tag_id in tag_ids])
            for tag_ids, ex in zip(kept_ids, examples)
        ]

Source File: tagging.py From nlp-architect with Apache License 2.0

5 votes

def inference(self, examples: List[TokenClsInputExample], batch_size: int = 64):
        """
        Do inference on given examples

        Args:
            examples (List[TokenClsInputExample]): examples
            batch_size (int, optional): batch size. Defaults to 64.

        Returns:
            List(tuple): a list of tuples of tokens, tags predicted by model
        """
        data_set = self.convert_to_tensors(examples, include_labels=False)
        loader = DataLoader(data_set, sampler=SequentialSampler(data_set), batch_size=batch_size)
        # Only the first element of what evaluate() returns is used below.
        eval_output = self.evaluate(loader)

        # Positions whose entry in the last dataset tensor is non-zero are kept.
        keep_mask = data_set.tensors[-1].view(len(data_set), -1) != 0.0
        predicted = torch.argmax(F.log_softmax(eval_output[0], dim=2), dim=2)

        kept_ids = [
            predicted[row][keep_mask[row]].detach().cpu().numpy()
            for row in range(predicted.size()[0])
        ]
        # Ids missing from the label map fall back to "O".
        return [
            (ex.tokens, [self.label_id_str.get(tag_id, "O") for tag_id in tag_ids])
            for tag_ids, ex in zip(kept_ids, examples)
        ]

Source File: pytorch_utils.py From nlp-recipes with MIT License

5 votes

def dataloader_from_dataset(
    ds, batch_size=32, num_gpus=None, shuffle=False, distributed=False
):
    """Creates a PyTorch DataLoader given a Dataset object.

    Args:
        ds (torch.utils.data.DataSet): A PyTorch dataset.
        batch_size (int, optional): Batch size per GPU; the loader's batch
            size is this value multiplied by ``max(1, num_gpus)``.
            Defaults to 32.
        num_gpus (int, optional): The number of GPUs to be used. When None,
            ``torch.cuda.device_count()`` is used. Defaults to None.
        shuffle (bool, optional): If True, a RandomSampler is used; ignored
            when ``distributed`` is True. Defaults to False.
        distributed (bool, optional): If True, a DistributedSampler is used.
            Defaults to False.

    Returns:
        DataLoader: a loader over ``ds`` using the selected sampler.
    """
    gpu_count = torch.cuda.device_count() if num_gpus is None else num_gpus
    effective_batch_size = batch_size * max(1, gpu_count)

    # `distributed` takes precedence over `shuffle`.
    if distributed:
        sampler_cls = DistributedSampler
    elif shuffle:
        sampler_cls = RandomSampler
    else:
        sampler_cls = SequentialSampler

    return DataLoader(ds, sampler=sampler_cls(ds), batch_size=effective_batch_size)

Source File: run_hnn.py From mt-dnn with MIT License

5 votes

def run_predict(args, model, device, test_data, prefix=None):
  """Run the model over every test item and dump each kind of prediction to disk.

  For every item in ``test_data`` the model is run batch by batch without
  gradients; the three logit outputs that are not None are collected,
  concatenated, saved with ``np.savetxt`` under ``args.output_dir`` and, when
  the item has a truthy ``predict_fn``, passed to it. Nothing is returned.

  Args:
    args: options object; reads ``predict_batch_size``, ``worker_num`` and
      ``output_dir``.
    model: callable returning four values, the first three being the
      SIM, LM and EN logits (any of which may be None).
    device: not used in this function.
    test_data: iterable of items exposing ``name``, ``data`` and ``predict_fn``.
    prefix: label inserted into the progress description, log lines and
      output file names.
  """
  # Run prediction for full data
  # NOTE(review): eval_results is created but never filled or returned.
  eval_results=OrderedDict()
  for test_item in test_data:
    torch.cuda.empty_cache()
    name = test_item.name
    test_sampler = SequentialSampler(test_item.data)
    test_dataloader = SequentialDataLoader(test_item.data, sampler=test_sampler, batch_size=args.predict_batch_size, num_workers = args.worker_num)
    model.eval()
    # One list of per-batch arrays for each of the three model outputs.
    sm_predicts=[]
    lm_predicts=[]
    en_predicts=[]
    for input_ids,tids,_ in tqdm(test_dataloader, ncols=80, desc='Predicting: {}'.format(prefix)):
      with torch.no_grad():
        sm_logits, lm_logits, en_logits,_ = model(input_ids, tids)
      # An output that is None for a batch is simply skipped.
      if sm_logits is not None:
        sm_predicts.append(sm_logits.detach().cpu().numpy())
      if lm_logits is not None:
        lm_predicts.append(lm_logits.detach().cpu().numpy())
      if en_logits is not None:
        en_predicts.append(en_logits.detach().cpu().numpy())
    # Helper that writes one prediction array; it closes over `name` and
    # `test_item` of the current loop iteration.
    def pred(predicts, tag):
      output_test_file = os.path.join(args.output_dir, "test_results_{}_{}_{}.txt".format(name, prefix, tag))
      logger.info("***** Dump prediction results-{}-{}-{} *****".format(name, prefix, tag))
      logger.info("Location: {}".format(output_test_file))
      np.savetxt(output_test_file, predicts, delimiter='\t')
      predict_fn = test_item.predict_fn
      if predict_fn:
        predict_fn(predicts, args.output_dir, name, prefix, tag=tag)
    # Each output kind is dumped only if at least one batch produced it.
    if len(lm_predicts)>0:
      lm_predicts = np.concatenate(lm_predicts, axis=0)
      pred(lm_predicts, 'LM-')

    if len(sm_predicts)>0:
      sm_predicts = np.concatenate(sm_predicts, axis=0)
      pred(sm_predicts, 'SIM-')

    if len(en_predicts)>0:
      en_predicts = np.concatenate(en_predicts, axis=0)
      pred(en_predicts, 'EN-Avg-')

Source File: run_sequence_level_classification.py From ZEN with Apache License 2.0

5 votes

def evaluate(args, model, tokenizer, ngram_dict, processor, label_list):
    """Evaluate ``model`` on the test split and return the task metrics.

    Args:
        args: options object; reads ``local_rank``, ``eval_batch_size``,
            ``device`` and ``task_name``.
        model: model called with ``input_ids``, ``input_ngram_ids`` and
            ``ngram_position_matrix``; its return value is used as the logits.
        tokenizer, ngram_dict, processor, label_list: forwarded to
            ``load_examples``.

    Returns:
        The result of ``compute_metrics`` on the argmax predictions and the
        gold label ids.

    Raises:
        ValueError: if the evaluation dataloader yields no batches.
    """
    eval_dataset = load_examples(args, tokenizer, ngram_dict, processor, label_list, mode="test")
    # Run prediction for full data
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_dataset)
    else:
        eval_sampler = DistributedSampler(eval_dataset)  # Note that this sampler samples randomly
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    model.eval()
    # Per-batch arrays are collected and concatenated once at the end;
    # growing an array with np.append on every batch re-copies everything
    # gathered so far and is quadratic in the number of batches.
    logit_chunks = []
    label_chunks = []

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)
        # The full 9-way unpack is kept so a malformed batch still fails loudly.
        input_ids, input_mask, segment_ids, label_ids, input_ngram_ids, ngram_position_matrix, \
        ngram_lengths, ngram_seg_ids, ngram_masks = batch

        with torch.no_grad():
            logits = model(input_ids=input_ids,
                           input_ngram_ids=input_ngram_ids,
                           ngram_position_matrix=ngram_position_matrix,
                           labels=None, head_mask=None)

        logit_chunks.append(logits.detach().cpu().numpy())
        label_chunks.append(label_ids.detach().cpu().numpy())

    if not logit_chunks:
        raise ValueError("evaluation dataloader produced no batches")

    preds = np.argmax(np.concatenate(logit_chunks, axis=0), axis=1)
    out_label_ids = np.concatenate(label_chunks, axis=0)
    return compute_metrics(args.task_name, preds, out_label_ids)

Source File: run_squad_document_full_e2e.py From RE3QA with Apache License 2.0

5 votes

def build_eval_data(args, eval_examples, eval_features, filtered_eval_features, filtered_rank_logits, logger):
    """Build the ranker and reader DataLoaders for evaluation.

    Args:
        args: options object; reads ``predict_batch_size`` and ``local_rank``.
        eval_examples: original examples (only counted and passed through).
        eval_features: features fed to the ranker loader.
        filtered_eval_features: features fed to the reader loader.
        filtered_rank_logits: passed through unchanged.
        logger: logger receiving the size summary.

    Returns:
        Tuple ``(eval_examples, eval_features, filtered_eval_features,
        filtered_rank_logits, eval_rank_dataloader, eval_read_dataloader)``.
        The ranker loader uses twice ``args.predict_batch_size``.
    """
    predict_batch_size_for_rank = 2 * args.predict_batch_size

    logger.info("Num orig examples = %d", len(eval_examples))
    logger.info("Num split features = %d", len(eval_features))
    logger.info("Num split filtered features = %d", len(filtered_eval_features))
    logger.info("Batch size for ranker = %d", predict_batch_size_for_rank)
    logger.info("Batch size for reader = %d", args.predict_batch_size)

    def make_loader(features, batch_size):
        # Pack ids, mask, segments and a running index into one dataset.
        input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        example_index = torch.arange(input_ids.size(0), dtype=torch.long)
        dataset = TensorDataset(input_ids, input_mask, segment_ids, example_index)
        # local_rank == -1 selects the in-order sampler.
        if args.local_rank == -1:
            sampler = SequentialSampler(dataset)
        else:
            sampler = DistributedSampler(dataset)
        return DataLoader(dataset, sampler=sampler, batch_size=batch_size)

    eval_rank_dataloader = make_loader(eval_features, predict_batch_size_for_rank)
    eval_read_dataloader = make_loader(filtered_eval_features, args.predict_batch_size)

    return (
        eval_examples,
        eval_features,
        filtered_eval_features,
        filtered_rank_logits,
        eval_rank_dataloader,
        eval_read_dataloader,
    )

Source File: extract_features.py From interpret_bert with GNU General Public License v3.0

5 votes

def save(args, model, tokenizer, device):
  """Write per-layer verb representations for every example group to JSON-lines files.

  For each key of the mapping returned by ``read_examples`` one file
  ``args.output_folder + key + ".json"`` is written, holding one JSON object
  per instance with the encoder outputs of every layer at the verb position.

  Args:
    args: options object; reads ``data_file``, ``output_folder`` and
      ``batch_size``.
    model: callable returning ``(all_encoder_layers, _)``.
    tokenizer: tokenizer forwarded to the feature conversion.
    device: device the input tensors are moved to.
  """
  # convert data to ids
  examples = read_examples(args.data_file, 0.09, 0.01, 50) # default numbers obtained from Linzen et al.

  # extract and write features
  for s_name in examples:
    s_instances = examples[s_name]
    output_file = args.output_folder + s_name + ".json"
    features = convert_examples_to_features(s_instances, seqLen=2+get_max_seq_length(s_instances, tokenizer), tokenizer=tokenizer)
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

    # The bar advances once per batch, so its total is the number of batches
    # (the floor division used before undercounted a trailing partial batch).
    pbar = tqdm(total=len(eval_dataloader))
    try:
      with open(output_file, "w", encoding='utf-8') as writer:
        for input_ids, input_mask, example_indices in eval_dataloader:
          input_ids = input_ids.to(device)
          input_mask = input_mask.to(device)
          # Feature extraction only: no autograd graph is needed.
          with torch.no_grad():
            all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
          # Convert every layer once per batch instead of once per example.
          layer_arrays = [layer.detach().cpu().numpy() for layer in all_encoder_layers]
          for b, example_index in enumerate(example_indices):
            unique_id = example_index.item()
            output_json = collections.OrderedDict()
            output_json["linex_index"] = unique_id
            # The stored verb index is shifted down by one before indexing.
            verb_index = s_instances[unique_id]['verb_index']-1
            output_json["verb_layers"] = [
              [round(x.item(), 6) for x in layer_output[b][verb_index]]
              for layer_output in layer_arrays
            ]
            output_json["linzen_info"] = s_instances[unique_id]
            writer.write(json.dumps(output_json) + "\n")
          pbar.update(1)
    finally:
      # Close the bar even if extraction fails part-way through.
      pbar.close()
    print('written features to %s'%output_file)

Source File: extract_features.py From interpret_bert with GNU General Public License v3.0

5 votes

def save(args, model, tokenizer, device):
  """Write per-layer chunk-boundary representations to a JSON-lines file.

  One JSON object per feature is written to ``args.output_file`` with the
  label, tokens, chunk start/end indices and the encoder outputs of every
  layer at those two positions.

  Args:
    args: options object; reads ``train_file``, ``batch_size`` and
      ``output_file``.
    model: callable returning ``(all_encoder_layers, _)``.
    tokenizer: tokenizer forwarded to the feature conversion.
    device: device the input tensors are moved to.
  """
  # convert data to ids
  examples = read_examples(args.train_file, 3000, 500) # default number of labeled and unlabeld chunks to consider are obtained from https://aclweb.org/anthology/D18-1179
  features = convert_examples_to_features(examples=examples, seq_length=2+get_max_seq_length(examples, tokenizer), tokenizer=tokenizer)
  chunk_spans = get_chunk_spans(examples, features)

  # extract and write features
  all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
  all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
  all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
  eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
  eval_sampler = SequentialSampler(eval_data)
  eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

  # The bar advances once per batch, so its total is the number of batches
  # (the floor division used before undercounted a trailing partial batch).
  pbar = tqdm(total=len(eval_dataloader))
  try:
    with open(args.output_file, "w", encoding='utf-8') as writer:
      for input_ids, input_mask, example_indices in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        # Feature extraction only: no autograd graph is needed.
        with torch.no_grad():
          all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
        # Convert every layer once per batch instead of once per example.
        layer_arrays = [layer.detach().cpu().numpy() for layer in all_encoder_layers]
        for b, example_index in enumerate(example_indices):
          feature_info = features[example_index.item()]
          unique_id = int(feature_info.unique_id)
          example_info, chunk_info = examples[unique_id], chunk_spans[unique_id]
          output_json = collections.OrderedDict()
          output_json["linex_index"] = unique_id
          output_json["label"] = example_info.label
          output_json["tokens"] = feature_info.tokens
          output_json["chunk_start_idx"] = chunk_info[0]
          output_json["chunk_end_idx"] = chunk_info[1]
          span_start_layers, span_end_layers = [], []
          for layer_output in layer_arrays:
            span_start_layers.append([round(x.item(), 6) for x in layer_output[b][chunk_info[0]]])
            span_end_layers.append([round(x.item(), 6) for x in layer_output[b][chunk_info[1]]])
          output_json["start_layer"] = span_start_layers
          output_json["end_layer"] = span_end_layers
          writer.write(json.dumps(output_json) + "\n")
        pbar.update(1)
  finally:
    # Close the bar even if extraction fails part-way through.
    pbar.close()
  print('written features to %s'%(args.output_file))

Source File: data.py From fast-bert with Apache License 2.0

5 votes

def get_dl_from_texts(self, texts):
    """Build a sequential DataLoader over raw texts.

    Each text is wrapped in an ``InputExample`` whose id is its position in
    ``texts`` and whose label is ``None``. The examples are converted to
    features with ``self.labels``, ``self.tokenizer`` and ``self.maxlen``,
    and the features' ``input_ids``, ``input_mask`` and ``segment_ids`` are
    packed into a ``TensorDataset``.

    Args:
        texts: iterable of input texts.

    Returns:
        A ``DataLoader`` over the dataset, read in dataset order via a
        ``SequentialSampler``, with batch size ``self.bs``.
    """
    # The original also accumulated an ``input_data`` list of
    # {'id', 'text'} dicts that was never read; it has been dropped.
    test_examples = [
        InputExample(index, text, label=None)
        for index, text in enumerate(texts)
    ]
    test_features = convert_examples_to_features(
        test_examples,
        label_list=self.labels,
        tokenizer=self.tokenizer,
        max_seq_length=self.maxlen,
    )

    all_input_ids = torch.tensor(
        [f.input_ids for f in test_features], dtype=torch.long)
    all_input_mask = torch.tensor(
        [f.input_mask for f in test_features], dtype=torch.long)
    all_segment_ids = torch.tensor(
        [f.segment_ids for f in test_features], dtype=torch.long)

    test_data = TensorDataset(
        all_input_ids, all_input_mask, all_segment_ids)

    test_sampler = SequentialSampler(test_data)
    return DataLoader(test_data, sampler=test_sampler, batch_size=self.bs)

Source File: data_abs.py From fast-bert with Apache License 2.0

5 votes

def get_dl_from_texts(self, texts):
    """Wrap ``texts`` in a summarization dataset and return its DataLoader.

    Batches are drawn in dataset order (``SequentialSampler``) with batch
    size ``self.batch_size_per_gpu``. Each batch is assembled by ``collate``
    using ``self.tokenizer``, ``self.max_seq_length`` as the block size and
    ``self.device``.
    """

    def _collate_batch(samples):
        # Bind tokenizer / block size / device so DataLoader can call this
        # with the raw list of samples only.
        return collate(
            samples,
            self.tokenizer,
            block_size=self.max_seq_length,
            device=self.device,
        )

    text_dataset = SummarizationInMemoryDataset(texts)
    return DataLoader(
        text_dataset,
        sampler=SequentialSampler(text_dataset),
        batch_size=self.batch_size_per_gpu,
        collate_fn=_collate_batch,
    )

Source File: run_cls_span.py From SpanABSA with Apache License 2.0

5 votes

def pipeline_eval_data(args, tokenizer, logger):
    """Build the evaluation DataLoader using previously extracted spans.

    Reads the ABSA file at ``args.data_dir/args.predict_file``, converts it
    to examples and features, then loads pickled extraction predictions from
    ``args.extraction_file`` and aligns them to the features by
    ``unique_id``. Span starts, span ends and span masks come from the
    extraction predictions; input ids, input mask and segment ids come from
    the features.

    Args:
        args: options object; reads ``debug``, ``data_dir``, ``predict_file``,
            ``verbose_logging``, ``max_seq_length``, ``extraction_file``,
            ``predict_batch_size`` and ``local_rank``. When ``args.debug`` is
            truthy, ``args.predict_batch_size`` is overwritten with 8.
        tokenizer: passed through to ``convert_examples_to_features``.
        logger: used for the summary log lines and passed to
            ``convert_examples_to_features``.

    Returns:
        Tuple ``(eval_examples, eval_features, eval_dataloader)``.

    Raises:
        AssertionError: if ``args.extraction_file`` is ``None``.
        KeyError: if a feature's ``unique_id`` has no extraction prediction.
    """
    if args.debug:
        args.predict_batch_size = 8

    eval_path = os.path.join(args.data_dir, args.predict_file)
    eval_set = read_absa_data(eval_path)
    eval_examples = convert_absa_data(dataset=eval_set, verbose_logging=args.verbose_logging)

    eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length,
                                                 args.verbose_logging, logger)

    assert args.extraction_file is not None
    # NOTE: pickle.load can execute arbitrary code; only point
    # args.extraction_file at files produced by a trusted extraction run.
    # The ``with`` block closes the handle, which the original bare
    # ``open(...)`` never did.
    with open(args.extraction_file, 'rb') as extraction_fh:
        extract_predictions = pickle.load(extraction_fh)

    # Index predictions by unique_id, then line them up with the features so
    # row i of every tensor below refers to the same feature.
    extract_dict = {pred.unique_id: pred for pred in extract_predictions}
    eval_extract_preds = [extract_dict[feature.unique_id] for feature in eval_features]

    logger.info("Num orig examples = %d", len(eval_examples))
    logger.info("Num split features = %d", len(eval_features))
    logger.info("Batch size = %d", args.predict_batch_size)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_span_starts = torch.tensor([f.start_indexes for f in eval_extract_preds], dtype=torch.long)
    all_span_ends = torch.tensor([f.end_indexes for f in eval_extract_preds], dtype=torch.long)
    all_label_masks = torch.tensor([f.span_masks for f in eval_extract_preds], dtype=torch.long)
    # Row index of each feature, carried through the dataset alongside it.
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_span_starts, all_span_ends,
                              all_label_masks, all_example_index)
    # local_rank == -1 selects the plain sequential sampler; any other value
    # selects DistributedSampler.
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
    return eval_examples, eval_features, eval_dataloader

Source File: run_cls_span.py From SpanABSA with Apache License 2.0

5 votes

def read_eval_data(args, tokenizer, logger):
    """Load the ABSA evaluation file and wrap its features in a DataLoader.

    Reads ``args.data_dir/args.predict_file``, converts it to examples and
    features, and packs the per-feature fields plus a running row index into
    a ``TensorDataset``. When ``args.debug`` is truthy,
    ``args.predict_batch_size`` is overwritten with 8.

    Returns:
        Tuple ``(examples, features, dataloader)``.
    """
    if args.debug:
        args.predict_batch_size = 8

    examples = convert_absa_data(
        dataset=read_absa_data(os.path.join(args.data_dir, args.predict_file)),
        verbose_logging=args.verbose_logging,
    )
    features = convert_examples_to_features(
        examples, tokenizer, args.max_seq_length, args.verbose_logging, logger
    )

    logger.info("Num orig examples = %d", len(examples))
    logger.info("Num split features = %d", len(features))
    logger.info("Batch size = %d", args.predict_batch_size)

    # Feature attributes, in the column order the dataset exposes them.
    field_names = (
        "input_ids",
        "input_mask",
        "segment_ids",
        "start_indexes",
        "end_indexes",
        "label_masks",
    )
    columns = [
        torch.tensor([getattr(feat, name) for feat in features], dtype=torch.long)
        for name in field_names
    ]
    # Last column: row index of each feature.
    columns.append(torch.arange(columns[0].size(0), dtype=torch.long))
    dataset = TensorDataset(*columns)

    # local_rank == -1 selects the sequential sampler, anything else the
    # distributed one.
    sampler_cls = SequentialSampler if args.local_rank == -1 else DistributedSampler
    dataloader = DataLoader(
        dataset, sampler=sampler_cls(dataset), batch_size=args.predict_batch_size
    )
    return examples, features, dataloader

Source File: run_bert.py From Bert-Multi-Label-Text-Classification with MIT License

5 votes

def run_test(args):
    """Run the BERT multi-label model over the configured test set.

    Reads ``config['test_path']`` with ``is_train=False``, builds (or loads
    from the cache files under ``config['data_dir']``) the test examples and
    features, loads the model from ``config['checkpoint_dir']`` and prints
    whatever ``Predictor.predict`` returns.

    Args:
        args: options object; reads ``do_lower_case``, ``arch``,
            ``eval_max_seq_len``, ``train_batch_size`` and ``n_gpu``.
    """
    from pybert.io.task_data import TaskData
    from pybert.test.predictor import Predictor
    data = TaskData()
    targets, sentences = data.read_data(raw_data_path=config['test_path'],
                                        preprocessor=EnglishPreProcessor(),
                                        is_train=False)
    lines = list(zip(sentences, targets))
    processor = BertProcessor(vocab_path=config['bert_vocab_path'], do_lower_case=args.do_lower_case)
    # Only the number of labels is needed here (to size the model head); the
    # original also built an id->label dict that was never used.
    label_list = processor.get_labels()

    test_data = processor.get_test(lines=lines)
    test_examples = processor.create_examples(
        lines=test_data,
        example_type='test',
        cached_examples_file=config['data_dir'] / f"cached_test_examples_{args.arch}",
    )
    test_features = processor.create_features(
        examples=test_examples,
        max_seq_len=args.eval_max_seq_len,
        cached_features_file=config['data_dir'] / "cached_test_features_{}_{}".format(
            args.eval_max_seq_len, args.arch
        ),
    )
    test_dataset = processor.create_dataset(test_features)
    test_sampler = SequentialSampler(test_dataset)
    # The test loader reuses args.train_batch_size as its batch size.
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.train_batch_size,
                                 collate_fn=collate_fn)
    model = BertForMultiLable.from_pretrained(config['checkpoint_dir'], num_labels=len(label_list))

    # ----------- predicting
    logger.info('model predicting....')
    predictor = Predictor(model=model,
                          logger=logger,
                          n_gpu=args.n_gpu)
    result = predictor.predict(data=test_dataloader)
    print(result)

Source File: train.py From subword-qac with MIT License

5 votes

def test(model, tokenizer, test_data, args):
    """Load saved weights into ``model`` and log its evaluation on ``test_data``.

    The model is restored from ``args.model_dir``, moved to the module-level
    ``device`` and evaluated over a sequentially sampled ``QueryDataset``.
    Only the summary string returned by ``evaluate`` is logged.
    """
    logger.info("Test starts!")
    model_load(args.model_dir, model)
    model = model.to(device)

    def _collate(batch):
        # Bind the tokenizer and sampling options for the DataLoader.
        return collate_fn(batch, tokenizer, args.sample, args.max_seq_len)

    dataset = QueryDataset(test_data)
    loader = DataLoader(
        dataset,
        sampler=SequentialSampler(dataset),
        batch_size=args.bsz,
        num_workers=args.num_workers,
        collate_fn=_collate,
    )

    # evaluate returns (loss, summary string); the loss is not used here.
    _, summary = evaluate(model, loader)
    logger.info(f"| test  | {summary}")

Source File: run_xlnet.py From Bert-Multi-Label-Text-Classification with MIT License

5 votes

def run_test(args):
    """Run the XLNet multi-label model over the configured test set.

    Reads ``config['test_path']``, builds (or loads from the cache files
    under ``config['data_dir']``) the test examples and features, loads the
    model from ``config['checkpoint_dir']`` and prints whatever
    ``Predictor.predict`` returns.

    Args:
        args: options object; reads ``do_lower_case``, ``arch``,
            ``eval_max_seq_len``, ``train_batch_size`` and ``n_gpu``.
    """
    from pybert.io.task_data import TaskData
    from pybert.test.predictor import Predictor
    data = TaskData()
    # NOTE(review): the test set is read with is_train=True, whereas
    # run_bert.py's run_test passes is_train=False. Left unchanged because
    # read_data is not visible here -- confirm which is intended.
    targets, sentences = data.read_data(raw_data_path=config['test_path'],
                                        preprocessor=EnglishPreProcessor(),
                                        is_train=True)
    # Materialise the pairs: a bare zip() is a one-shot iterator that has no
    # len() and yields nothing on a second pass.
    lines = list(zip(sentences, targets))
    processor = XlnetProcessor(vocab_path=config['xlnet_vocab_path'], do_lower_case=args.do_lower_case)
    # Only the number of labels is needed here (to size the model head); the
    # original also built an id->label dict that was never used.
    label_list = processor.get_labels()

    test_data = processor.get_test(lines=lines)
    test_examples = processor.create_examples(
        lines=test_data,
        example_type='test',
        cached_examples_file=config['data_dir'] / f"cached_test_examples_{args.arch}",
    )
    test_features = processor.create_features(
        examples=test_examples,
        max_seq_len=args.eval_max_seq_len,
        cached_features_file=config['data_dir'] / "cached_test_features_{}_{}".format(
            args.eval_max_seq_len, args.arch
        ),
    )
    test_dataset = processor.create_dataset(test_features)
    test_sampler = SequentialSampler(test_dataset)
    # The test loader reuses args.train_batch_size as its batch size.
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.train_batch_size,
                                 collate_fn=collate_fn)
    model = XlnetForMultiLable.from_pretrained(config['checkpoint_dir'], num_labels=len(label_list))
    # ----------- predicting
    logger.info('model predicting....')
    predictor = Predictor(model=model, logger=logger, n_gpu=args.n_gpu)
    result = predictor.predict(data=test_dataloader)
    print(result)

Python torch.utils.data.SequentialSampler() Examples