Python fuel.schemes.ConstantScheme() Examples

The following are 22 code examples of fuel.schemes.ConstantScheme(), collected from open-source projects. Each example notes the source file and project it comes from. You may also want to check out the other available functions and classes of the fuel.schemes module.
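Before the project examples, here is a minimal, self-contained sketch of the basic pattern (assuming fuel and numpy are installed): ConstantScheme(n) paired with the Batch transformer groups an example stream into batches of n examples, the same combination used in Example #12 below.

import numpy
from fuel.datasets import IterableDataset
from fuel.schemes import ConstantScheme
from fuel.transformers import Batch

# Wrap ten numbers in a dataset and batch the example stream four at a time.
dataset = IterableDataset({'numbers': numpy.arange(10, dtype='float32')})
stream = Batch(dataset.get_example_stream(), ConstantScheme(4))

for batch in stream.get_epoch_iterator():
    print(batch)  # with the default strictness, batches of 4, 4, and 2 examples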
Example #1
Source File: data.py    From DeepMind-Teaching-Machines-to-Read-and-Comprehend with MIT License
def setup_datastream(path, vocab_file, config):
    ds = QADataset(path, vocab_file, config.n_entities, need_sep_token=config.concat_ctx_and_question)
    it = QAIterator(path, shuffle=config.shuffle_questions)

    stream = DataStream(ds, iteration_scheme=it)

    if config.concat_ctx_and_question:
        stream = ConcatCtxAndQuestion(stream, config.concat_question_before, ds.reverse_vocab['<SEP>'])

    # Read ahead sort_batch_count batches' worth of examples and sort them so each batch contains sequences of similar length
    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size * config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('question' if config.concat_ctx_and_question else 'context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream, mask_sources=['context', 'question', 'candidates'], mask_dtype='int32')

    return ds, stream 
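Note the two ConstantScheme instances above: the first reads ahead config.batch_size * config.sort_batch_count examples at a time so they can be sorted by length; Unpack then turns the sorted chunk back into single examples, and the second ConstantScheme re-batches them into batches of config.batch_size. Grouping sequences of similar length this way keeps the padding added by Padding to a minimum. The same idiom appears in Examples #2 and #17.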
Example #2
Source File: timit.py    From CTC-LSTM with Apache License 2.0
def setup_datastream(path, batch_size, sort_batch_count, valid=False):
    A = numpy.load(os.path.join(path, ('valid_x_raw.npy' if valid else 'train_x_raw.npy')))
    B = numpy.load(os.path.join(path, ('valid_phn.npy' if valid else 'train_phn.npy')))
    C = numpy.load(os.path.join(path, ('valid_seq_to_phn.npy' if valid else 'train_seq_to_phn.npy')))

    # For each sequence, take the label column (column 2) of B over the
    # row range [x[0], x[1]) given by C
    D = [B[x[0]:x[1], 2] for x in C]

    ds = IndexableDataset({'input': A, 'output': D})
    stream = DataStream(ds, iteration_scheme=ShuffledExampleScheme(len(A)))

    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size * sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('input'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size, num_examples=len(A)))
    stream = Padding(stream, mask_sources=['input', 'output'])

    return ds, stream 
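Example #2 also passes num_examples to the second ConstantScheme, which caps how many requests the scheme emits. A small sketch of the three construction modes, based on the behavior documented for fuel's ConstantScheme (worth verifying against your installed version):

from itertools import islice
from fuel.schemes import ConstantScheme

# With no bound, the scheme yields the batch size forever; the stream it
# drives must terminate on its own (e.g. when the dataset is exhausted).
print(list(islice(ConstantScheme(32).get_request_iterator(), 3)))        # [32, 32, 32]

# times=k yields exactly k requests.
print(list(ConstantScheme(32, times=2).get_request_iterator()))          # [32, 32]

# num_examples=n yields enough requests to cover n examples, with a
# smaller final request when n is not a multiple of the batch size.
print(list(ConstantScheme(32, num_examples=80).get_request_iterator()))  # [32, 32, 16]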
Example #3
Source File: keyphrase_copynet.py    From seq2seq-keyphrase with MIT License
def output_stream(dataset, batch_size, size=1):
    data_stream = dataset.get_example_stream()
    data_stream = transformers.Batch(data_stream,
                                     iteration_scheme=schemes.ConstantScheme(batch_size))

    # add padding and masks to the dataset
    # Warning: with multiple output sources, Padding raises "ValueError: All dimensions except length must be equal"; those sources must be padded manually
    # data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target', 'target_c'))
    # data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target'))
    return data_stream 
Example #4
Source File: build_dataset.py    From CopyNet with MIT License
def obtain_stream(dataset, batch_size, size=1):
    if size == 1:
        data_stream = dataset.get_example_stream()
        data_stream = transformers.Batch(data_stream, iteration_scheme=schemes.ConstantScheme(batch_size))

        # add padding and masks to the dataset
        data_stream = transformers.Padding(data_stream, mask_sources=('data',))  # note: a one-element tuple; bare ('data') is just a string
        return data_stream
    else:
        data_streams = [dataset.get_example_stream() for _ in range(size)]
        data_streams = [transformers.Batch(data_stream, iteration_scheme=schemes.ConstantScheme(batch_size))
                        for data_stream in data_streams]
        data_streams = [transformers.Padding(data_stream, mask_sources=('data',)) for data_stream in data_streams]
        return data_streams 
Example #5
Source File: copynet.py    From CopyNet with MIT License
def output_stream(dataset, batch_size, size=1):
    data_stream = dataset.get_example_stream()
    data_stream = transformers.Batch(data_stream,
                                     iteration_scheme=schemes.ConstantScheme(batch_size))

    # add padding and masks to the dataset
    data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target'))
    return data_stream
Example #6
Source File: syntest.py    From CopyNet with MIT License
def output_stream(dataset, batch_size, size=1):
    data_stream = dataset.get_example_stream()
    data_stream = transformers.Batch(data_stream,
                                     iteration_scheme=schemes.ConstantScheme(batch_size))

    # add padding and masks to the dataset
    data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target', 'target_c'))
    return data_stream
Example #7
Source File: lcsts_vest.py    From CopyNet with MIT License
def output_stream(dataset, batch_size, size=1):
    data_stream = dataset.get_example_stream()
    data_stream = transformers.Batch(data_stream,
                                     iteration_scheme=schemes.ConstantScheme(batch_size))

    # add padding and masks to the dataset
    data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target'))
    return data_stream
Example #8
Source File: lcsts_test.py    From CopyNet with MIT License
def output_stream(dataset, batch_size, size=1):
    data_stream = dataset.get_example_stream()
    data_stream = transformers.Batch(data_stream,
                                     iteration_scheme=schemes.ConstantScheme(batch_size))

    # add padding and masks to the dataset
    data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target', 'target_c'))
    return data_stream
Example #9
Source File: syn_vest.py    From CopyNet with MIT License
def output_stream(dataset, batch_size, size=1):
    data_stream = dataset.get_example_stream()
    data_stream = transformers.Batch(data_stream,
                                     iteration_scheme=schemes.ConstantScheme(batch_size))

    # add padding and masks to the dataset
    data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target'))
    return data_stream
Example #10
Source File: bst_vest.py    From CopyNet with MIT License
def output_stream(dataset, batch_size, size=1):
    data_stream = dataset.get_example_stream()
    data_stream = transformers.Batch(data_stream,
                                     iteration_scheme=schemes.ConstantScheme(batch_size))

    # add padding and masks to the dataset
    data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target'))
    return data_stream
Example #11
Source File: weibo_vest.py    From CopyNet with MIT License
def output_stream(dataset, batch_size, size=1):
    data_stream = dataset.get_example_stream()
    data_stream = transformers.Batch(data_stream,
                                     iteration_scheme=schemes.ConstantScheme(batch_size))

    # add padding and masks to the dataset
    data_stream = transformers.Padding(data_stream, mask_sources=('source', 'target'))
    return data_stream
Example #12
Source File: __init__.py    From blocks-examples with MIT License
def get_data_stream(iterable):
    """Returns a 'fuel.Batch' datastream of
    [x~input~numbers, y~targets~roots], with each iteration returning a
    batch of 20 training examples
    """
    numbers = numpy.asarray(iterable, dtype=floatX)
    dataset = IterableDataset(
        {'numbers': numbers, 'roots': numpy.sqrt(numbers)})
    return Batch(dataset.get_example_stream(), ConstantScheme(20)) 
Example #13
Source File: test_datasets.py    From fuel with MIT License
def test_example_iteration_scheme(self):
        scheme = ConstantScheme(2)

        class MinimalDataset(Dataset):
            provides_sources = ('data',)
            _example_iteration_scheme = scheme

            def get_data(self, state=None, request=None):
                pass

        assert MinimalDataset().example_iteration_scheme is scheme 
Example #14
Source File: toy_dataset.py    From CTC-LSTM with Apache License 2.0
def setup_datastream(batch_size, **kwargs):
    ds = ToyDataset(**kwargs)
    stream = DataStream(ds, iteration_scheme=SequentialExampleScheme(kwargs['nb_examples']))

    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    stream = Padding(stream, mask_sources=['input', 'output'])

    return ds, stream 
Example #15
Source File: test_datasets.py    From attention-lvcsr with MIT License
def test_example_iteration_scheme(self):
        scheme = ConstantScheme(2)

        class MinimalDataset(Dataset):
            provides_sources = ('data',)
            _example_iteration_scheme = scheme

            def get_data(self, state=None, request=None):
                pass

        assert MinimalDataset().example_iteration_scheme is scheme 
Example #16
Source File: build_dataset.py    From seq2seq-keyphrase with MIT License
def obtain_stream(dataset, batch_size, size=1):
    if size == 1:
        data_stream = dataset.get_example_stream()
        data_stream = transformers.Batch(data_stream, iteration_scheme=schemes.ConstantScheme(batch_size))

        # add padding and masks to the dataset
        data_stream = transformers.Padding(data_stream, mask_sources=('data',))
        return data_stream
    else:
        data_streams = [dataset.get_example_stream() for _ in range(size)]
        data_streams = [transformers.Batch(data_stream, iteration_scheme=schemes.ConstantScheme(batch_size))
                        for data_stream in data_streams]
        data_streams = [transformers.Padding(data_stream, mask_sources=('data',)) for data_stream in data_streams]
        return data_streams 
Example #17
Source File: stream.py    From blocks-examples with MIT License
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict) else
        cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream,
                    predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       batch_size*sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1])

    return masked_stream 
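PaddingWithEOS is not part of fuel; it is a project-specific transformer defined in blocks-examples that, unlike the stock Padding used elsewhere on this page, pads sequences with the given end-of-sequence indices (src_vocab_size - 1 and trg_vocab_size - 1 here) rather than with zeros.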
Example #18
Source File: test_datasets.py    From fuel with MIT License
def test_data_driven_epochs():
    class TestDataset(IterableDataset):
        sources = ('data',)

        def __init__(self):
            self.axis_labels = None
            self.data = [[1, 2, 3, 4],
                         [5, 6, 7, 8]]

        def open(self):
            epoch_iter = iter(self.data)
            data_iter = iter(next(epoch_iter))
            return (epoch_iter, data_iter)

        def next_epoch(self, state):
            try:
                data_iter = iter(next(state[0]))
                return (state[0], data_iter)
            except StopIteration:
                return self.open()

        def get_data(self, state, request):
            data = []
            for i in range(request):
                data.append(next(state[1]))
            return (data,)

    epochs = []
    epochs.append([([1],), ([2],), ([3],), ([4],)])
    epochs.append([([5],), ([6],), ([7],), ([8],)])
    stream = DataStream(TestDataset(), iteration_scheme=ConstantScheme(1))
    assert list(stream.get_epoch_iterator()) == epochs[0]
    assert list(stream.get_epoch_iterator()) == epochs[1]
    assert list(stream.get_epoch_iterator()) == epochs[0]

    stream.reset()
    for i, epoch in zip(range(2), stream.iterate_epochs()):
        assert list(epoch) == epochs[i]

    # test scheme resetting between epochs
    class TestScheme(BatchSizeScheme):

        def get_request_iterator(self):
            return iter([1, 2, 1, 3])

    epochs = []
    epochs.append([([1],), ([2, 3],), ([4],)])
    epochs.append([([5],), ([6, 7],), ([8],)])
    stream = DataStream(TestDataset(), iteration_scheme=TestScheme())
    for i, epoch in zip(range(2), stream.iterate_epochs()):
        assert list(epoch) == epochs[i] 
Example #19
Source File: stream.py    From dl4mt-multi with BSD 3-Clause "New" or "Revised" License
def get_log_prob_stream(cg, config):
    eid, did = p_(cg)
    dataset = config['log_prob_sets'][cg]

    # Prepare source vocabs and files, make sure special tokens are there
    src_vocab = cPickle.load(open(config['src_vocabs'][eid]))
    src_vocab['<S>'] = 0
    src_vocab['</S>'] = config['src_eos_idxs'][eid]
    src_vocab['<UNK>'] = config['unk_id']

    # Prepare target vocabs and files, make sure special tokens are there
    trg_vocab = cPickle.load(open(config['trg_vocabs'][did]))
    trg_vocab['<S>'] = 0
    trg_vocab['</S>'] = config['trg_eos_idxs'][did]
    trg_vocab['<UNK>'] = config['unk_id']

    # Build the preprocessing pipeline for individual streams
    logger.info('Building logprob stream for cg:[{}]'.format(cg))
    src_dataset = TextFile([dataset[0]], src_vocab, None)
    trg_dataset = TextFile([dataset[1]], trg_vocab, None)
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    stream = Mapping(stream, _oov_to_unk(
                     src_vocab_size=config['src_vocab_sizes'][eid],
                     trg_vocab_size=config['trg_vocab_sizes'][did],
                     unk_id=config['unk_id']))
    bs = 100
    if 'log_prob_bs' in config:
        if isinstance(config['log_prob_bs'], dict):
            bs = config['log_prob_bs'][cg]
        else:
            bs = config['log_prob_bs']
    stream = Batch(
        stream,
        iteration_scheme=ConstantScheme(
            bs, num_examples=get_num_lines(dataset[0])))

    masked_stream = Padding(stream)
    masked_stream = Mapping(
        masked_stream, _remapWordIdx(
            [(0, 0, config['src_eos_idxs'][eid]),
             (2, 0, config['trg_eos_idxs'][did])]))

    return masked_stream 
Example #20
Source File: stream.py    From dl4mt-multi with BSD 3-Clause "New" or "Revised" License
def get_logprob_streams(config):
    if 'log_prob_sets' not in config:
        return None

    cgs = config['cgs']
    enc_ids, dec_ids = get_enc_dec_ids(cgs)
    datasets = config['log_prob_sets']

    # Prepare source vocabs and files, make sure special tokens are there
    src_vocabs = {k: cPickle.load(open(v))
                  for k, v in config['src_vocabs'].iteritems()}
    for k in src_vocabs.keys():
        src_vocabs[k]['<S>'] = 0
        src_vocabs[k]['</S>'] = config['src_eos_idxs'][k]
        src_vocabs[k]['<UNK>'] = config['unk_id']

    # Prepare target vocabs and files, make sure special tokens are there
    trg_vocabs = {k: cPickle.load(open(v))
                  for k, v in config['trg_vocabs'].iteritems()}
    for k in trg_vocabs.keys():
        trg_vocabs[k]['<S>'] = 0
        trg_vocabs[k]['</S>'] = config['trg_eos_idxs'][k]
        trg_vocabs[k]['<UNK>'] = config['unk_id']

    # Build the preprocessing pipeline for individual streams
    ind_streams = {}
    for cg in cgs:
        eid, did = p_(cg)
        if cg not in datasets:
            continue
        logger.info('Building logprob stream for cg:[{}]'.format(cg))
        src_dataset = TextFile([datasets[cg][0]], src_vocabs[p_(cg)[0]], None)
        trg_dataset = TextFile([datasets[cg][1]], trg_vocabs[p_(cg)[1]], None)
        stream = Merge([src_dataset.get_example_stream(),
                        trg_dataset.get_example_stream()],
                       ('source', 'target'))

        stream = Mapping(stream, _oov_to_unk(
                         src_vocab_size=config['src_vocab_sizes'][eid],
                         trg_vocab_size=config['trg_vocab_sizes'][did],
                         unk_id=config['unk_id']))
        bs = 100
        if 'log_prob_bs' in config:
            if isinstance(config['log_prob_bs'], dict):
                bs = config['log_prob_bs'][cg]
            else:
                bs = config['log_prob_bs']

        stream = Batch(
            stream,
            iteration_scheme=ConstantScheme(
                bs, num_examples=get_num_lines(datasets[cg][0])))

        masked_stream = Padding(stream)
        masked_stream = Mapping(
            masked_stream, _remapWordIdx(
                [(0, 0, config['src_eos_idxs'][eid]),
                 (2, 0, config['trg_eos_idxs'][did])]))
        ind_streams[cg] = masked_stream

    return ind_streams 
Example #21
Source File: stream.py    From dl4mt-multi with BSD 3-Clause "New" or "Revised" License
def get_src_trg_stream(cg, config, src_datasets=None, trg_datasets=None,
                       is_training=True, src_vocabs=None, trg_vocabs=None,
                       logprob_datasets=None):
    eid, did = p_(cg)
    if is_training:
        logger.info(' ... src:[{}] - [{}]'.format(
            eid, src_datasets[cg].files[0]))
        logger.info(' ... trg:[{}] - [{}]'.format(
            did, trg_datasets[cg].files[0]))
        stream = Merge([src_datasets[cg].get_example_stream(),
                        trg_datasets[cg].get_example_stream()],
                       ('source', 'target'))
        stream = Filter(stream, predicate=_too_long(config['src_seq_len'],
                                                    config['tgt_seq_len']))

        if 'min_seq_lens' in config and config['min_seq_lens'][cg] > 0:
            stream = Filter(stream,
                            predicate=_too_short(config['min_seq_lens'][cg]))

        stream = Mapping(stream, _oov_to_unk(
                         src_vocab_size=config['src_vocab_sizes'][eid],
                         trg_vocab_size=config['trg_vocab_sizes'][did],
                         unk_id=config['unk_id']))
        stream = Batch(
            stream, iteration_scheme=ConstantScheme(
                config['batch_sizes'][cg]*config['sort_k_batches']))

        stream = Mapping(stream, SortMapping(_length))
        stream = Unpack(stream)
        stream = Batch(stream, iteration_scheme=ConstantScheme(
            config['batch_sizes'][cg]))
    else:  # logprob stream
        src_dataset = TextFile([logprob_datasets[cg][0]],
                               src_vocabs[p_(cg)[0]], None)
        trg_dataset = TextFile([logprob_datasets[cg][1]],
                               trg_vocabs[p_(cg)[1]], None)
        stream = Merge([src_dataset.get_example_stream(),
                        trg_dataset.get_example_stream()],
                       ('source', 'target'))
        stream = Mapping(stream, _oov_to_unk(
                         src_vocab_size=config['src_vocab_sizes'][eid],
                         trg_vocab_size=config['trg_vocab_sizes'][did],
                         unk_id=config['unk_id']))
        bs = 100
        if 'log_prob_bs' in config:
            if isinstance(config['log_prob_bs'], dict):
                bs = config['log_prob_bs'][cg]
            else:
                bs = config['log_prob_bs']
        stream = Batch(stream, iteration_scheme=ConstantScheme(bs))

    masked_stream = Padding(stream)
    masked_stream = Mapping(
        masked_stream, _remapWordIdx(
            [(0, 0, config['src_eos_idxs'][eid]),
             (2, 0, config['trg_eos_idxs'][did])]))
    return masked_stream 
Example #22
Source File: test_datasets.py    From attention-lvcsr with MIT License
def test_data_driven_epochs():
    class TestDataset(IterableDataset):
        sources = ('data',)

        def __init__(self):
            self.axis_labels = None
            self.data = [[1, 2, 3, 4],
                         [5, 6, 7, 8]]

        def open(self):
            epoch_iter = iter(self.data)
            data_iter = iter(next(epoch_iter))
            return (epoch_iter, data_iter)

        def next_epoch(self, state):
            try:
                data_iter = iter(next(state[0]))
                return (state[0], data_iter)
            except StopIteration:
                return self.open()

        def get_data(self, state, request):
            data = []
            for i in range(request):
                data.append(next(state[1]))
            return (data,)

    epochs = []
    epochs.append([([1],), ([2],), ([3],), ([4],)])
    epochs.append([([5],), ([6],), ([7],), ([8],)])
    stream = DataStream(TestDataset(), iteration_scheme=ConstantScheme(1))
    assert list(stream.get_epoch_iterator()) == epochs[0]
    assert list(stream.get_epoch_iterator()) == epochs[1]
    assert list(stream.get_epoch_iterator()) == epochs[0]

    stream.reset()
    for i, epoch in zip(range(2), stream.iterate_epochs()):
        assert list(epoch) == epochs[i]

    # test scheme resetting between epochs
    class TestScheme(BatchSizeScheme):

        def get_request_iterator(self):
            return iter([1, 2, 1, 3])

    epochs = []
    epochs.append([([1],), ([2, 3],), ([4],)])
    epochs.append([([5],), ([6, 7],), ([8],)])
    stream = DataStream(TestDataset(), iteration_scheme=TestScheme())
    for i, epoch in zip(range(2), stream.iterate_epochs()):
        assert list(epoch) == epochs[i]