Python utils.load_vocab() Examples

The following are 4 code examples of utils.load_vocab(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module utils , or try the search function .
Example #1
Source File: infer.py    From listen-attend-and-spell with Apache License 2.0 5 votes vote down vote up
def main(args):

    vocab_list = np.array(utils.load_vocab(args.vocab))

    vocab_size = len(vocab_list)

    config = tf.estimator.RunConfig(model_dir=args.model_dir)
    hparams = utils.create_hparams(
        args, vocab_size, utils.SOS_ID, utils.EOS_ID)

    hparams.decoder.set_hparam('beam_width', args.beam_width)

    model = tf.estimator.Estimator(
        model_fn=las_model_fn,
        config=config,
        params=hparams)

    predictions = model.predict(
        input_fn=lambda: input_fn(
            args.data, args.vocab, num_channels=args.num_channels, batch_size=args.batch_size, num_epochs=1),
        predict_keys='sample_ids')

    if args.beam_width > 0:
        predictions = [vocab_list[y['sample_ids'][:, 0]].tolist() + [utils.EOS]
                       for y in predictions]
    else:
        predictions = [vocab_list[y['sample_ids']].tolist() + [utils.EOS]
                       for y in predictions]

    predictions = [' '.join(y[:y.index(utils.EOS)]) for y in predictions]

    with open(args.save, 'w') as f:
        f.write('\n'.join(predictions)) 
Example #2
Source File: train.py    From listen-attend-and-spell with Apache License 2.0 5 votes vote down vote up
def main(args):

    vocab_list = utils.load_vocab(args.vocab)

    vocab_size = len(vocab_list)

    config = tf.estimator.RunConfig(model_dir=args.model_dir)
    hparams = utils.create_hparams(
        args, vocab_size, utils.SOS_ID, utils.EOS_ID)

    model = tf.estimator.Estimator(
        model_fn=las_model_fn,
        config=config,
        params=hparams)

    if args.valid:
        train_spec = tf.estimator.TrainSpec(
            input_fn=lambda: input_fn(
                args.train, args.vocab, num_channels=args.num_channels, batch_size=args.batch_size, num_epochs=args.num_epochs))

        eval_spec = tf.estimator.EvalSpec(
            input_fn=lambda: input_fn(
                args.valid or args.train, args.vocab, num_channels=args.num_channels, batch_size=args.batch_size),
            start_delay_secs=60,
            throttle_secs=args.eval_secs)

        tf.estimator.train_and_evaluate(model, train_spec, eval_spec)
    else:
        model.train(
            input_fn=lambda: input_fn(
                args.train, args.vocab, num_channels=args.num_channels, batch_size=args.batch_size, num_epochs=args.num_epochs)) 
Example #3
Source File: lucene_search.py    From dl4ir-query-reformulator with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def create_index(self, index_folder, docs_path, add_terms=False):

        print 'Loading Vocab...'
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)
       
        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print "%d docs in index" % self.writer.numDocs()
        print "Indexing documents..."

        doc_id = 0

        import corpus_hdf5
        corpus = corpus_hdf5.CorpusHDF5(docs_path) 
        for txt in corpus.get_text_iter():
            title = corpus.get_article_title(doc_id)
            self.add_doc(doc_id, title, txt, add_terms)
            if doc_id % 1000 == 0:
                print 'indexing doc', doc_id
            doc_id += 1
                 
        print "Index of %d docs..." % self.writer.numDocs()
        self.writer.close() 
Example #4
Source File: main.py    From Bert-BiLSTM-CRF-pytorch with MIT License 4 votes vote down vote up
def train(**kwargs):
    config = Config()
    config.update(**kwargs)
    print('当前设置为:\n', config)
    if config.use_cuda:
        torch.cuda.set_device(config.gpu)
    print('loading corpus')
    vocab = load_vocab(config.vocab)
    label_dic = load_vocab(config.label_file)
    tagset_size = len(label_dic)
    train_data = read_corpus(config.train_file, max_length=config.max_length, label_dic=label_dic, vocab=vocab)
    dev_data = read_corpus(config.dev_file, max_length=config.max_length, label_dic=label_dic, vocab=vocab)

    train_ids = torch.LongTensor([temp.input_id for temp in train_data])
    train_masks = torch.LongTensor([temp.input_mask for temp in train_data])
    train_tags = torch.LongTensor([temp.label_id for temp in train_data])

    train_dataset = TensorDataset(train_ids, train_masks, train_tags)
    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=config.batch_size)

    dev_ids = torch.LongTensor([temp.input_id for temp in dev_data])
    dev_masks = torch.LongTensor([temp.input_mask for temp in dev_data])
    dev_tags = torch.LongTensor([temp.label_id for temp in dev_data])

    dev_dataset = TensorDataset(dev_ids, dev_masks, dev_tags)
    dev_loader = DataLoader(dev_dataset, shuffle=True, batch_size=config.batch_size)
    model = BERT_LSTM_CRF(config.bert_path, tagset_size, config.bert_embedding, config.rnn_hidden, config.rnn_layer, dropout_ratio=config.dropout_ratio, dropout1=config.dropout1, use_cuda=config.use_cuda)
    if config.load_model:
        assert config.load_path is not None
        model = load_model(model, name=config.load_path)
    if config.use_cuda:
        model.cuda()
    model.train()
    optimizer = getattr(optim, config.optim)
    optimizer = optimizer(model.parameters(), lr=config.lr, weight_decay=config.weight_decay)
    eval_loss = 10000
    for epoch in range(config.base_epoch):
        step = 0
        for i, batch in enumerate(train_loader):
            step += 1
            model.zero_grad()
            inputs, masks, tags = batch
            inputs, masks, tags = Variable(inputs), Variable(masks), Variable(tags)
            if config.use_cuda:
                inputs, masks, tags = inputs.cuda(), masks.cuda(), tags.cuda()
            feats = model(inputs, masks)
            loss = model.loss(feats, masks,tags)
            loss.backward()
            optimizer.step()
            if step % 50 == 0:
                print('step: {} |  epoch: {}|  loss: {}'.format(step, epoch, loss.item()))
        loss_temp = dev(model, dev_loader, epoch, config)
        if loss_temp < eval_loss:
            save_model(model,epoch)