Python torchtext.data.BucketIterator() Examples

The following are 30 code examples of torchtext.data.BucketIterator(). Each example is taken from an open-source project; the source file and license are noted above each snippet. You may also want to check out the other available functions and classes of the torchtext.data module.
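Before the project-specific examples, here is a minimal, self-contained sketch of the typical BucketIterator workflow: define fields, load a tabular dataset, build vocabularies, then batch examples of similar length together. The file name reviews.csv and its columns are placeholders, and the API shown is the legacy torchtext interface (torchtext.data in releases before 0.9, torchtext.legacy.data afterwards).

import torch
from torchtext.data import Field, LabelField, TabularDataset, BucketIterator

# Placeholder fields for a hypothetical reviews.csv with two columns: text, label.
TEXT = Field(tokenize=str.split, lower=True, batch_first=True, include_lengths=True)
LABEL = LabelField()

dataset = TabularDataset(path="reviews.csv", format="csv", skip_header=True,
                         fields=[("text", TEXT), ("label", LABEL)])
train, valid = dataset.split(split_ratio=0.9)

TEXT.build_vocab(train, max_size=25000)
LABEL.build_vocab(train)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iter, valid_iter = BucketIterator.splits(
    (train, valid), batch_size=32, device=device,
    sort_key=lambda x: len(x.text),  # group examples of similar length into the same batch
    sort_within_batch=True)          # sort each batch by decreasing length

for batch in train_iter:
    text, lengths = batch.text       # include_lengths=True makes batch.text a (data, lengths) pair
    labels = batch.label
    break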
Example #1
Source File: semantic_similar_data.py    From glyce with Apache License 2.0
def __init__(self, args):
        self.RAW = data.RawField()
        self.RAW.is_target = False
        tokenize = lambda x: list(x)
        self.TEXT = data.Field(batch_first=True, tokenize=tokenize)
        self.LABEL = data.Field(sequential=False, unk_token=None)
        self.train, self.dev, self.test = data.TabularDataset.splits(
            path='/data/nfsdata/nlp/datasets/sentence_pair/bq_corpus_torch10',
            train='BQ_train.json',
            validation='BQ_dev.json',
            test='BQ_test.json',
            format='json',
            fields={"gold_label": ("label", self.LABEL),
                    "sentence1": ("q1", self.TEXT),
                    "sentence2": ("q2", self.TEXT),
                    "ID": ("id", self.RAW)})

        self.TEXT.build_vocab(self.train, self.dev, self.test, vectors=Vectors("BQ300", args.data))
        self.LABEL.build_vocab(self.train)

        sort_key = lambda x: data.interleave_keys(len(x.q1), len(x.q2))
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.train_iter = data.BucketIterator(self.train, batch_size=args.batch_size, device=device,
                                               sort_key=sort_key, sort=True)
        self.dev_iter = data.BucketIterator(self.dev, batch_size=args.batch_size, device=device,
                                             sort_key=sort_key, sort=True)
        self.test_iter = data.BucketIterator(self.test, batch_size=args.batch_size, device=device,
                                              sort_key=sort_key, sort=True)
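The sort_key above uses data.interleave_keys, which bit-interleaves the two lengths so that bucketing groups pairs that are similar in both q1 and q2 length at once. A small sketch of the same pattern as a reusable helper (the dataset and its q1/q2 attributes are assumptions, not part of the glyce code):

from torchtext.data import BucketIterator, interleave_keys

def make_pair_iterator(pairs, batch_size=64, device=None):
    # `pairs` is assumed to be a torchtext Dataset whose examples carry .q1 and .q2 token lists
    return BucketIterator(pairs, batch_size=batch_size, device=device,
                          sort_key=lambda ex: interleave_keys(len(ex.q1), len(ex.q2)),
                          sort_within_batch=True, shuffle=True)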
Example #2
Source File: datasets.py    From TorchFusion with MIT License
def json_data_loader(file_path, fields, split_ratio=None, split_seed=None, skip_header=False, save_vocab_path=os.getcwd(), batch_size=32, device=None, train=True, **args):
    """
    Loads a JSON dataset and wraps it in a BucketIterator.

    :param file_path: path to the JSON data file
    :param fields: torchtext fields describing the columns to load
    :param split_ratio: optional ratio used to split the data
    :param split_seed: random seed used for the split
    :param skip_header: whether to skip the first line of the file
    :param save_vocab_path: directory in which to save the built vocabulary
    :param batch_size: number of examples per batch
    :param device: device on which batches are created
    :param train: whether the iterator is used for training (enables shuffling)
    :param args: extra keyword arguments forwarded to the dataset loader
    :return: a BucketIterator over the loaded dataset
    """
    dataset = load_tabular_set(file_path, "json", fields=fields, split_ratio=split_ratio, split_seed=split_seed, skip_header=skip_header, save_vocab_path=save_vocab_path, **args)
    return BucketIterator(dataset, batch_size=batch_size, device=device, train=True, shuffle=train, repeat=False)
Example #3
Source File: datasets.py    From TorchFusion with MIT License
def tsv_data_split_loader(root_path, fields, train=None, val=None, test=None, skip_header=False, save_vocab_path=os.getcwd(), batch_size=32, device=None, **args):
    """
    Loads pre-split TSV train/val/test files and wraps them in a BucketIterator.

    :param root_path: directory containing the split files
    :param fields: torchtext fields describing the columns to load
    :param train: file name of the training split
    :param val: file name of the validation split
    :param test: file name of the test split
    :param skip_header: whether to skip the first line of each file
    :param save_vocab_path: directory in which to save the built vocabulary
    :param batch_size: number of examples per batch
    :param device: device on which batches are created
    :param args: extra keyword arguments forwarded to the dataset loader
    :return: a BucketIterator over the loaded dataset
    """
    dataset = load_tabular_set_split(root_path, "tsv", fields=fields, train=train, val=val, test=test, skip_header=skip_header, save_vocab_path=save_vocab_path, **args)
    return BucketIterator(dataset, batch_size=batch_size, device=device, train=True, shuffle=train, repeat=False)
Example #4
Source File: datasets.py    From TorchFusion with MIT License
def tsv_data_loader(file_path, fields, split_ratio=None, split_seed=None, skip_header=False, save_vocab_path=os.getcwd(), batch_size=32, device=None, train=True, **args):
    """
    Loads a TSV dataset and wraps it in a BucketIterator.

    :param file_path: path to the TSV data file
    :param fields: torchtext fields describing the columns to load
    :param split_ratio: optional ratio used to split the data
    :param split_seed: random seed used for the split
    :param skip_header: whether to skip the first line of the file
    :param save_vocab_path: directory in which to save the built vocabulary
    :param batch_size: number of examples per batch
    :param device: device on which batches are created
    :param train: whether the iterator is used for training (enables shuffling)
    :param args: extra keyword arguments forwarded to the dataset loader
    :return: a BucketIterator over the loaded dataset
    """
    dataset = load_tabular_set(file_path, "tsv", fields=fields, split_ratio=split_ratio, split_seed=split_seed, skip_header=skip_header, save_vocab_path=save_vocab_path, **args)
    return BucketIterator(dataset, batch_size=batch_size, device=device, train=True, shuffle=train, repeat=False)
Example #5
Source File: datasets.py    From TorchFusion with MIT License
def csv_data_split_loader(root_path, fields, train=None, val=None, test=None, skip_header=False, save_vocab_path=os.getcwd(), batch_size=32, device=None, **args):
    """
    Loads pre-split CSV train/val/test files and wraps them in a BucketIterator.

    :param root_path: directory containing the split files
    :param fields: torchtext fields describing the columns to load
    :param train: file name of the training split
    :param val: file name of the validation split
    :param test: file name of the test split
    :param skip_header: whether to skip the first line of each file
    :param save_vocab_path: directory in which to save the built vocabulary
    :param batch_size: number of examples per batch
    :param device: device on which batches are created
    :param args: extra keyword arguments forwarded to the dataset loader
    :return: a BucketIterator over the loaded dataset
    """
    dataset = load_tabular_set_split(root_path, "csv", fields=fields, train=train, val=val, test=test, skip_header=skip_header, save_vocab_path=save_vocab_path, **args)
    return BucketIterator(dataset, batch_size=batch_size, device=device, train=True, shuffle=train, repeat=False)
Example #6
Source File: iterators.py    From OpenKiwi with GNU Affero General Public License v3.0
def build_bucket_iterator(dataset, device, batch_size, is_train):
    device_obj = None if device is None else torch.device(device)
    iterator = data.BucketIterator(
        dataset=dataset,
        batch_size=batch_size,
        repeat=False,
        sort_key=dataset.sort_key,
        sort=False,
        # sorts the data within each minibatch in decreasing order
        # set to true if you want use pack_padded_sequences
        sort_within_batch=is_train,
        # shuffle batches
        shuffle=is_train,
        device=device_obj,
        train=is_train,
    )
    return iterator 
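The comments above point at the usual reason for sort_within_batch=True: torch.nn.utils.rnn.pack_padded_sequence expects each batch sorted by decreasing length. A hedged sketch of how a batch from such an iterator would typically feed an RNN, assuming the source field was built with include_lengths=True (the field name src, the vocabulary size, and the model dimensions are placeholders):

import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

embedding = nn.Embedding(num_embeddings=10000, embedding_dim=128)  # placeholder sizes
lstm = nn.LSTM(input_size=128, hidden_size=256, batch_first=True)

def encode(batch):
    # batch.src is a (padded_ids, lengths) pair when the Field used include_lengths=True
    padded_ids, lengths = batch.src
    embedded = embedding(padded_ids)
    # lengths must live on CPU; the batch is already sorted thanks to sort_within_batch=True
    packed = pack_padded_sequence(embedded, lengths.cpu(), batch_first=True)
    packed_out, (h_n, c_n) = lstm(packed)
    outputs, _ = pad_packed_sequence(packed_out, batch_first=True)
    return outputs, h_n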
Example #7
Source File: datasets.py    From TorchFusion with MIT License
def csv_data_loader(file_path, fields, split_ratio=None, split_seed=None, skip_header=False, save_vocab_path=os.getcwd(), batch_size=32, device=None, train=True, **args):
    """
    Loads a CSV dataset and wraps it in a BucketIterator.

    :param file_path: path to the CSV data file
    :param fields: torchtext fields describing the columns to load
    :param split_ratio: optional ratio used to split the data
    :param split_seed: random seed used for the split
    :param skip_header: whether to skip the first line of the file
    :param save_vocab_path: directory in which to save the built vocabulary
    :param batch_size: number of examples per batch
    :param device: device on which batches are created
    :param train: whether the iterator is used for training (enables shuffling)
    :param args: extra keyword arguments forwarded to the dataset loader
    :return: a BucketIterator over the loaded dataset
    """
    dataset = load_tabular_set(file_path, "csv", fields=fields, split_ratio=split_ratio, split_seed=split_seed, skip_header=skip_header, save_vocab_path=save_vocab_path, **args)
    return BucketIterator(dataset, batch_size=batch_size, device=device, train=True, shuffle=train, repeat=False)
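For context, a hypothetical call to the csv_data_loader above might look like the following. The fields layout is an assumption (a two-column CSV with text and label); check the TorchFusion documentation for the exact format load_tabular_set expects.

from torchtext.data import Field, LabelField

TEXT = Field(tokenize=str.split, lower=True, batch_first=True)   # assumed text column
LABEL = LabelField()                                              # assumed label column
fields = [("text", TEXT), ("label", LABEL)]

train_loader = csv_data_loader("train.csv", fields, skip_header=True,
                               batch_size=32, device=None, train=True)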
Example #8
Source File: train.py    From attention-is-all-you-need-pytorch with MIT License
def prepare_dataloaders(opt, device):
    batch_size = opt.batch_size
    data = pickle.load(open(opt.data_pkl, 'rb'))

    opt.max_token_seq_len = data['settings'].max_len
    opt.src_pad_idx = data['vocab']['src'].vocab.stoi[Constants.PAD_WORD]
    opt.trg_pad_idx = data['vocab']['trg'].vocab.stoi[Constants.PAD_WORD]

    opt.src_vocab_size = len(data['vocab']['src'].vocab)
    opt.trg_vocab_size = len(data['vocab']['trg'].vocab)

    #========= Preparing Model =========#
    if opt.embs_share_weight:
        assert data['vocab']['src'].vocab.stoi == data['vocab']['trg'].vocab.stoi, \
            'To share word embeddings, the src/trg word2idx tables must be the same.'

    fields = {'src': data['vocab']['src'], 'trg':data['vocab']['trg']}

    train = Dataset(examples=data['train'], fields=fields)
    val = Dataset(examples=data['valid'], fields=fields)

    train_iterator = BucketIterator(train, batch_size=batch_size, device=device, train=True)
    val_iterator = BucketIterator(val, batch_size=batch_size, device=device)

    return train_iterator, val_iterator 
Example #9
Source File: preprocessing_funcs.py    From NLP_Toolkit with Apache License 2.0
def load_dataloaders(args):
    logger.info("Preparing dataloaders...")
    FR = torchtext.data.Field(tokenize=dum_tokenizer, lower=True, init_token="<sos>", eos_token="<eos>",\
                              batch_first=True)
    EN = torchtext.data.Field(tokenize=dum_tokenizer, lower=True, batch_first=True)
    
    train_path = os.path.join("./data/", "df.csv")
    if not os.path.isfile(train_path):
        tokenize_data(args)
    train = torchtext.data.TabularDataset(train_path, format="csv", \
                                             fields=[("EN", EN), ("FR", FR)])
    FR.build_vocab(train)
    EN.build_vocab(train)
    train_iter = BucketIterator(train, batch_size=args.batch_size, repeat=False,
                                sort_key=lambda x: (len(x.EN), len(x.FR)),
                                shuffle=True, train=True)
    train_length = len(train)
    logger.info("Loaded dataloaders.")
    return train_iter, FR, EN, train_length 
Example #10
Source File: train.py    From aivivn-tone with MIT License
def train_in_parts(self, train_parts, val, val_iterator, batch_size, start_epoch=0, print_every=100):
        for epoch in range(start_epoch, self.n_epochs):
            # shuffle data each epoch
            random.shuffle(train_parts)

            for train_src_, train_tgt_ in train_parts:
                # create train dataset
                print("Training part [{}] with target [{}]...".format(train_src_, train_tgt_))
                train_ = Seq2SeqDataset.from_file(train_src_, train_tgt_, share_fields_from=val)

                # create iterator
                train_iterator_ = BucketIterator(dataset=train_, batch_size=batch_size,
                                                 sort=False, sort_within_batch=True,
                                                 sort_key=lambda x: len(x.src),
                                                 shuffle=True, device=device)
                # train
                self._train_epoch(epoch, train_iterator_, train=True, print_every=print_every)

                # clean
                del train_
                del train_iterator_
                gc.collect()

            # save
            self.save(epoch)

            # evaluate on validation set after each epoch
            with torch.no_grad():
                self._train_epoch(epoch, val_iterator, train=False, print_every=print_every) 
Example #11
Source File: data.py    From joeynmt with Apache License 2.0
def make_data_iter(dataset: Dataset,
                   batch_size: int,
                   batch_type: str = "sentence",
                   train: bool = False,
                   shuffle: bool = False) -> Iterator:
    """
    Returns a torchtext iterator for a torchtext dataset.

    :param dataset: torchtext dataset containing src and optionally trg
    :param batch_size: size of the batches the iterator prepares
    :param batch_type: measure batch size by sentence count or by token count
    :param train: whether this is training time; when turned off,
        bucketing, sorting within batches, and shuffling are disabled
    :param shuffle: whether to shuffle the data before each epoch
        (has no effect if train is False)
    :return: torchtext iterator
    """

    batch_size_fn = token_batch_size_fn if batch_type == "token" else None

    if train:
        # optionally shuffle and sort during training
        data_iter = data.BucketIterator(
            repeat=False, sort=False, dataset=dataset,
            batch_size=batch_size, batch_size_fn=batch_size_fn,
            train=True, sort_within_batch=True,
            sort_key=lambda x: len(x.src), shuffle=shuffle)
    else:
        # don't sort/shuffle for validation/inference
        data_iter = data.BucketIterator(
            repeat=False, dataset=dataset,
            batch_size=batch_size, batch_size_fn=batch_size_fn,
            train=False, sort=False)

    return data_iter 
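batch_size_fn lets the iterator measure batch size in tokens rather than sentences. joeynmt's token_batch_size_fn itself is not shown here; the sketch below follows the general torchtext contract batch_size_fn(new_example, count, size_so_far) and simply sums source tokens, which is one common way to implement it (an assumption, not joeynmt's exact function):

def summed_token_batch_size_fn(new, count, sofar):
    # `new` is the example about to join the batch, `count` is how many examples
    # the batch holds after adding it, and `sofar` is the previous return value.
    if count == 1:   # first example of a fresh batch: reset the running total
        sofar = 0
    return sofar + len(new.src)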
Example #12
Source File: iterator.py    From deepmatcher with BSD 3-Clause "New" or "Revised" License
def create_batches(self):
        if self.sort_in_buckets:
            return data.BucketIterator.create_batches(self)
        else:
            return data.Iterator.create_batches(self) 
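This method belongs to a subclass that can switch bucketing on or off at construction time. A sketch of how such a class could be assembled (the sort_in_buckets flag comes from the snippet; the class name and constructor are assumptions, not deepmatcher's actual implementation):

from torchtext import data

class OptionalBucketIterator(data.BucketIterator):
    """BucketIterator that can fall back to plain, unbucketed batching."""

    def __init__(self, dataset, batch_size, sort_in_buckets=True, **kwargs):
        self.sort_in_buckets = sort_in_buckets
        super().__init__(dataset, batch_size, **kwargs)

    def create_batches(self):
        if self.sort_in_buckets:
            return data.BucketIterator.create_batches(self)
        return data.Iterator.create_batches(self)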
Example #13
Source File: train.py    From attention-is-all-you-need-pytorch with MIT License
def prepare_dataloaders_from_bpe_files(opt, device):
    batch_size = opt.batch_size
    MIN_FREQ = 2
    if not opt.embs_share_weight:
        raise

    data = pickle.load(open(opt.data_pkl, 'rb'))
    MAX_LEN = data['settings'].max_len
    field = data['vocab']
    fields = (field, field)

    def filter_examples_with_length(x):
        return len(vars(x)['src']) <= MAX_LEN and len(vars(x)['trg']) <= MAX_LEN

    train = TranslationDataset(
        fields=fields,
        path=opt.train_path, 
        exts=('.src', '.trg'),
        filter_pred=filter_examples_with_length)
    val = TranslationDataset(
        fields=fields,
        path=opt.val_path, 
        exts=('.src', '.trg'),
        filter_pred=filter_examples_with_length)

    opt.max_token_seq_len = MAX_LEN + 2
    opt.src_pad_idx = opt.trg_pad_idx = field.vocab.stoi[Constants.PAD_WORD]
    opt.src_vocab_size = opt.trg_vocab_size = len(field.vocab)

    train_iterator = BucketIterator(train, batch_size=batch_size, device=device, train=True)
    val_iterator = BucketIterator(val, batch_size=batch_size, device=device)
    return train_iterator, val_iterator 
Example #14
Source File: train.py    From aivivn-tone with MIT License
def load_data(train_src, train_tgt, val_src, val_tgt, batch_size=64, save_path="checkpoint"):
    # prepare dataset
    print("Reading data...")
    train = Seq2SeqDataset.from_file(train_src, train_tgt)

    print("Building vocab...")
    train.build_vocab(max_size=300)

    val = Seq2SeqDataset.from_file(val_src, val_tgt, share_fields_from=train)

    src_vocab = train.src_field.vocab
    tgt_vocab = train.tgt_field.vocab

    # save vocab
    with open(os.path.join(save_path, "vocab.src"), "wb") as f:
        dill.dump(src_vocab, f)
    with open(os.path.join(save_path, "vocab.tgt"), "wb") as f:
        dill.dump(tgt_vocab, f)

    print("Source vocab size:", len(src_vocab))
    print("Target vocab size:", len(tgt_vocab))

    # data iterator
    # keep sort=False and shuffle=False to speed up training and reduce memory usage
    train_iterator = BucketIterator(dataset=train, batch_size=batch_size,
                                    sort=False, sort_within_batch=True,
                                    sort_key=lambda x: len(x.src),
                                    shuffle=False, device=device)
    val_iterator = BucketIterator(dataset=val, batch_size=batch_size, train=False,
                                  sort=False, sort_within_batch=True,
                                  sort_key=lambda x: len(x.src),
                                  shuffle=False, device=device)

    return src_vocab, tgt_vocab, train_iterator, val_iterator 
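The vocab objects written with dill above are what the prediction script (Example #30) loads back before building its test iterator. A minimal sketch of the reload side, assuming the same save_path layout:

import os
import dill

def load_vocabs(save_path="checkpoint"):
    with open(os.path.join(save_path, "vocab.src"), "rb") as f:
        src_vocab = dill.load(f)
    with open(os.path.join(save_path, "vocab.tgt"), "rb") as f:
        tgt_vocab = dill.load(f)
    return src_vocab, tgt_vocab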
Example #15
Source File: tool.py    From lightNLP with Apache License 2.0
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                     sort_key=lambda x: len(x.source), sort_within_batch=True):
        return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key,
                              sort_within_batch=sort_within_batch) 
Example #16
Source File: tool.py    From lightNLP with Apache License 2.0
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                     sort_key=lambda x: len(x.query), sort_within_batch=True):
        return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key,
                              sort_within_batch=sort_within_batch) 
Example #17
Source File: tool.py    From lightNLP with Apache License 2.0
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                     sort_key=lambda x: len(x.text), sort_within_batch=True):
        return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key,
                              sort_within_batch=sort_within_batch) 
Example #18
Source File: tool.py    From lightNLP with Apache License 2.0
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                     sort_key=lambda x: len(x.word), sort_within_batch=True):
        return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key,
                              sort_within_batch=sort_within_batch) 
Example #19
Source File: tool.py    From lightNLP with Apache License 2.0
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE):
        return BucketIterator(dataset, batch_size=batch_size, device=device) 
Example #20
Source File: tool.py    From lightNLP with Apache License 2.0
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                     sort_key=lambda x: len(x.text), sort_within_batch=True):
        return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key,
                              sort_within_batch=sort_within_batch) 
Example #21
Source File: tool.py    From lightNLP with Apache License 2.0
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                     sort_key=lambda x: len(x.text), sort_within_batch=True):
        return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key,
                              sort_within_batch=sort_within_batch) 
Example #22
Source File: tool.py    From lightNLP with Apache License 2.0
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                     sort_key=lambda x: len(x.text), sort_within_batch=True):
        return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key,
                              sort_within_batch=sort_within_batch) 
Example #23
Source File: tool.py    From lightNLP with Apache License 2.0
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                 sort_key=lambda x: len(x.texta)):
        return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key) 
Example #24
Source File: tool.py    From lightNLP with Apache License 2.0
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                 sort_key=lambda x: len(x.texta)):
        return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key) 
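Examples #15 through #24 are near-identical wrappers that differ mainly in which example attribute the sort_key measures (source, query, text, word, texta). The shared pattern can be captured in one helper that takes the attribute name as a parameter (a sketch, not lightNLP code):

from torchtext.data import BucketIterator, Dataset

def make_bucket_iterator(dataset: Dataset, attr: str, batch_size: int = 32,
                         device=None, sort_within_batch: bool = True) -> BucketIterator:
    # Bucket `dataset` by the length of the example attribute named `attr`.
    return BucketIterator(dataset, batch_size=batch_size, device=device,
                          sort_key=lambda ex: len(getattr(ex, attr)),
                          sort_within_batch=sort_within_batch)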
Example #25
Source File: train.py    From aivivn-tone with MIT License
def load_data_in_parts(train_src, train_tgt, val_src, val_tgt, batch_size=64, save_path="checkpoint"):
    # prepare dataset
    print("Reading data...")
    val = Seq2SeqDataset.from_file(val_src, val_tgt)

    print("Building vocab...")
    val.build_vocab(max_size=300)

    src_vocab = val.src_field.vocab
    tgt_vocab = val.tgt_field.vocab

    # save vocab
    with open(os.path.join(save_path, "vocab.src"), "wb") as f:
        dill.dump(src_vocab, f)
    with open(os.path.join(save_path, "vocab.tgt"), "wb") as f:
        dill.dump(tgt_vocab, f)

    print("Source vocab size:", len(src_vocab))
    print("Target vocab size:", len(tgt_vocab))

    # data iterator
    # keep sort=False and shuffle=False to speed up training and reduce memory usage
    val_iterator = BucketIterator(dataset=val, batch_size=batch_size, train=False,
                                  sort=False, sort_within_batch=True,
                                  sort_key=lambda x: len(x.src),
                                  shuffle=False, device=device)

    return src_vocab, tgt_vocab, list(zip(train_src, train_tgt)), val, val_iterator, batch_size 
Example #26
Source File: tool.py    From lightKG with Apache License 2.0
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                     sort_key=lambda x: len(x.text), sort_within_batch=True):
        return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key,
                              sort_within_batch=sort_within_batch) 
Example #27
Source File: tool.py    From lightKG with Apache License 2.0
def get_iterator(self, dataset: Dataset, batch_size=DEFAULT_CONFIG['batch_size'], device=DEVICE,
                     sort_key=lambda x: len(x.text), sort_within_batch=True):
        return BucketIterator(dataset, batch_size=batch_size, device=device, sort_key=sort_key,
                              sort_within_batch=sort_within_batch) 
Example #28
Source File: torchtext_data_loaders.py    From quick-nlp with MIT License
def __init__(self, dataset: Dataset, batch_size: int, source_names: List[str], target_names: List[str],
                 sort_key: Optional[Callable] = None, **kwargs):
        self.dataset = dataset
        self.source_names = source_names
        self.target_names = target_names
        # sort by the first field if no sort key is given
        if sort_key is None:
            def sort_key(x):
                return getattr(x, self.source_names[0])
        device = None if cuda.is_available() else -1
        self.dl = BucketIterator(dataset, batch_size=batch_size, sort_key=sort_key, device=device, **kwargs)
        self.bs = batch_size
        self.iter = 0 
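Note the device = None if cuda.is_available() else -1 line: very old torchtext releases took an integer device argument (-1 meaning CPU), whereas later releases expect a torch.device, as most of the other examples on this page show. The modern equivalent would be roughly:

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# pass device=device to BucketIterator instead of the None / -1 convention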
Example #29
Source File: train_E2E.py    From conv-emotion with MIT License
def get_E2E_loaders(path, valid=0.1, batch_size=32):
    utterance = data.Field(tokenize=tokenizer, lower=True)
    label     = data.Field(sequential=False, postprocessing=Pipeline(convert_token=convert_token))
    id        = data.Field(use_vocab=False,sequential=False)
    fields = [('id', id),
              ('turn1', utterance),
              ('turn2', utterance),
              ('turn3', utterance),
              ('label', label)]

    train = data.TabularDataset('{}/train.txt'.format(path),
                                format='tsv',
                                fields=fields,
                                skip_header=True)
    valid = data.TabularDataset('{}/valid.txt'.format(path),
                                format='tsv',
                                fields=fields,
                                skip_header=True)

    test = data.TabularDataset('{}/test.txt'.format(path),
                                format='tsv',
                                fields=fields,
                                skip_header=True)
    vectors = vocab.Vectors(name='emojiplusglove.txt', cache='/media/backup/nlp-cic/DialogueRNN/')
    utterance.build_vocab(train, valid, test, vectors=vectors)
    #utterance.build_vocab(train, valid, test, vectors='glove.840B.300d')
    label.build_vocab(train)
    train_iter = BucketIterator(train,
                                  train=True,
                                  batch_size=batch_size,
                                  sort_key=lambda x: len(x.turn3),
                                  device=torch.device(0))
    valid_iter = BucketIterator(valid,
                                  batch_size=batch_size,
                                  sort_key=lambda x: len(x.turn3),
                                  device=torch.device(0))
    test_iter = BucketIterator(test,
                                  batch_size=batch_size,
                                  sort_key=lambda x: len(x.turn3),
                                  device=torch.device(0))
    return train_iter, valid_iter, test_iter,\
            utterance.vocab.vectors if not args.cuda else utterance.vocab.vectors.cuda(),\
            label.vocab.itos 
Example #30
Source File: predict.py    From aivivn-tone with MIT License
def predict(self, test_path, test_cleaned_path, out_path):
        # read raw data to list
        lines_id = []
        lines_raw = []
        lines_cleaned = []
        lines_prep = []
        with open(test_path, 'r') as f, open(test_cleaned_path, 'r') as fc:
            for line in f:
                line_id = line[:3]
                line_seq = line[4:]
                lines_id.append(line_id)
                lines_raw.append(line_seq)
                lines_prep.append(self.preprocess(line_seq))
            for line in fc:
                lines_cleaned.append(line[4:])

        # prepare dataset
        print("Reading test data...")
        test = Seq2SeqDataset.from_list(lines_prep)
        test.src_field.vocab = self.src_vocab

        # prepare iterator
        test_iterator = BucketIterator(dataset=test, batch_size=1, train=False,
                                       sort=False, sort_within_batch=False,
                                       shuffle=False, device=device)

        # predict
        with open(out_path, 'w') as writer:
            with torch.no_grad():
                for i, batch in enumerate(test_iterator):
                    # forward through model
                    _, _, output = self.model(batch, has_targets=False, mask_softmax=1.0, teacher_forcing=1.0)

                    # get top-1
                    predicted_values, predicted_indices = torch.max(output, dim=-1)

                    # convert predicted vocab indices to an actual sentence
                    predicted_seq = [self.tgt_vocab.itos[c] for c in predicted_indices.squeeze(0).tolist()]

                    # output is log_softmax so do exp()
                    predicted_values = predicted_values.exp()

                    # convert to list
                    predicted_values_ = predicted_values.squeeze(0).tolist()

                    # beam search
                    predicted_seq = self.beam_lm(''.join(predicted_seq[1:-1]), predicted_values_[1:-1], lines_raw[i])

                    # match case and punctuations
                    predicted_seq = self.match_case(predicted_seq, lines_raw[i])

                    # do some post-processing to match submission output
                    predicted_seq = self.match_output(predicted_seq, lines_cleaned[i])
                    print("{} {}".format(i, predicted_seq))

                    # write to file with line_id
                    writer.write(lines_id[i] + ',' + predicted_seq + '\n')