Python data.article2ids() Examples
The following are 6 code examples of data.article2ids(). You can go to the original project or source file by following the links above each example. You may also want to check out the other available functions and classes of the data module.
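For context, data.article2ids() maps article tokens to vocabulary ids and gives each in-article out-of-vocabulary (OOV) word a temporary id just past the end of the vocabulary, so a pointer-generator model can copy it; it returns the extended id list together with the list of in-article OOV words. Below is a minimal sketch of how such a function is typically implemented (modeled on the pointer-generator data.py; UNKNOWN_TOKEN and the exact Vocab interface are assumptions):

UNKNOWN_TOKEN = '[UNK]'  # assumed UNK marker; the real module defines its own

def article2ids(article_words, vocab):
    """Map article words to ids; in-article OOVs get temporary extended-vocab ids."""
    ids = []
    oovs = []  # in-article OOV words, in order of first appearance
    unk_id = vocab.word2id(UNKNOWN_TOKEN)
    for w in article_words:
        i = vocab.word2id(w)
        if i == unk_id:  # w is out-of-vocabulary
            if w not in oovs:
                oovs.append(w)
            ids.append(vocab.size() + oovs.index(w))  # temporary id beyond the vocab
        else:
            ids.append(i)
    return ids, oovs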
Example #1
Source File: batcher.py From Reinforce-Paraphrase-Generation with MIT License
def __init__(self, article, abstract_sentences, vocab):
    # Get ids of special tokens
    start_decoding = vocab.word2id(data.START_DECODING)
    stop_decoding = vocab.word2id(data.STOP_DECODING)

    # Process the article
    article_words = article.split()
    if len(article_words) > config.max_enc_steps:
        article_words = article_words[:config.max_enc_steps]
    self.enc_len = len(article_words)  # store the length after truncation but before padding
    self.enc_input = [vocab.word2id(w) for w in article_words]  # list of word ids; OOVs are represented by the id for UNK token

    # Process the abstract
    abstract = ' '.join(abstract_sentences)
    abstract_words = abstract.split()  # list of strings
    abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

    # Get the decoder input sequence and target sequence
    self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
    self.dec_len = len(self.dec_input)

    # If using pointer-generator mode, we need to store some extra info
    if config.pointer_gen:
        # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id;
        # also store the in-article OOV words themselves
        self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

        # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
        abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

        # Overwrite decoder target sequence so it uses the temp article OOV ids
        # NOTE: dec_input does not contain article OOV ids!
        _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, config.max_dec_steps, start_decoding, stop_decoding)

    # Store the original strings
    self.original_article = article
    self.original_abstract = abstract
    self.original_abstract_sents = abstract_sentences
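Every example on this page also calls self.get_dec_inp_targ_seqs, which is not shown here. In these batchers it builds the decoder input (prefixed with the start token) and the target (suffixed with the stop token), both truncated to max_len. A sketch of the usual implementation (details may vary slightly between the repositories listed here):

def get_dec_inp_targ_seqs(self, sequence, max_len, start_id, stop_id):
    """Build decoder input ([start] + ids) and target (ids + [stop]), truncated to max_len."""
    inp = [start_id] + sequence[:]
    target = sequence[:]
    if len(inp) > max_len:  # truncate; a truncated target gets no stop token
        inp = inp[:max_len]
        target = target[:max_len]
    else:
        target.append(stop_id)  # an untruncated target ends with the stop token
    assert len(inp) == len(target)
    return inp, target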
Example #2
Source File: batcher.py From pointer_summarizer with Apache License 2.0
def __init__(self, article, abstract_sentences, vocab):
    # Get ids of special tokens
    start_decoding = vocab.word2id(data.START_DECODING)
    stop_decoding = vocab.word2id(data.STOP_DECODING)

    # Process the article
    article_words = article.split()
    if len(article_words) > config.max_enc_steps:
        article_words = article_words[:config.max_enc_steps]
    self.enc_len = len(article_words)  # store the length after truncation but before padding
    self.enc_input = [vocab.word2id(w) for w in article_words]  # list of word ids; OOVs are represented by the id for UNK token

    # Process the abstract
    abstract = ' '.join(abstract_sentences)  # string
    abstract_words = abstract.split()  # list of strings
    abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

    # Get the decoder input sequence and target sequence
    self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids, config.max_dec_steps, start_decoding, stop_decoding)
    self.dec_len = len(self.dec_input)

    # If using pointer-generator mode, we need to store some extra info
    if config.pointer_gen:
        # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id;
        # also store the in-article OOV words themselves
        self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

        # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
        abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

        # Overwrite decoder target sequence so it uses the temp article OOV ids
        _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, config.max_dec_steps, start_decoding, stop_decoding)

    # Store the original strings
    self.original_article = article
    self.original_abstract = abstract
    self.original_abstract_sents = abstract_sentences
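data.abstract2ids(), used in the pointer_gen branch of these examples, is the companion to data.article2ids(): abstract words that are OOV but appear in the article reuse the article's temporary ids, while all other OOVs stay mapped to UNK. A sketch under the same assumptions as the article2ids sketch above:

def abstract2ids(abstract_words, vocab, article_oovs):
    """Map abstract words to ids, reusing the article's temporary OOV ids."""
    ids = []
    unk_id = vocab.word2id(UNKNOWN_TOKEN)
    for w in abstract_words:
        i = vocab.word2id(w)
        if i == unk_id:
            if w in article_oovs:  # OOV that can be copied from the article
                ids.append(vocab.size() + article_oovs.index(w))
            else:  # OOV that cannot be copied
                ids.append(unk_id)
        else:
            ids.append(i)
    return ids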
Example #3
Source File: batcher.py From TransferRL with MIT License
def __init__(self, article, abstract_sentences, vocab, hps):
    """Initializes the Example, performing tokenization and truncation to produce the
    encoder, decoder and target sequences, which are stored in self.

    Args:
        article: source text; a string. Each token is separated by a single space.
        abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.
        vocab: Vocabulary object
        hps: hyperparameters
    """
    self.hps = hps

    # Get ids of special tokens
    start_decoding = vocab.word2id(data.START_DECODING)
    stop_decoding = vocab.word2id(data.STOP_DECODING)

    # Process the article
    article_words = article.split()
    if len(article_words) > hps.max_enc_steps:
        article_words = article_words[:hps.max_enc_steps]
    self.enc_len = len(article_words)  # store the length after truncation but before padding
    self.enc_input = [vocab.word2id(w) for w in article_words]  # list of word ids; OOVs are represented by the id for UNK token

    # Process the abstract
    abstract = ' '.join(abstract_sentences)  # string
    abstract_words = abstract.split()  # list of strings
    abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

    # Get the decoder input sequence and target sequence
    self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
    self.dec_len = len(self.dec_input)

    # If using pointer-generator mode, we need to store some extra info
    if hps.pointer_gen:
        # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id;
        # also store the in-article OOV words themselves
        self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

        # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
        abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

        # Overwrite decoder target sequence so it uses the temp article OOV ids
        _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding)

    # Store the original strings
    self.original_article = article
    self.original_abstract = abstract
    self.original_abstract_sents = abstract_sentences
Example #4
Source File: batcher.py From RLSeq2Seq with MIT License
def __init__(self, article, abstract_sentences, vocab, hps):
    """Initializes the Example, performing tokenization and truncation to produce the
    encoder, decoder and target sequences, which are stored in self.

    Args:
        article: source text; a string. Each token is separated by a single space.
        abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.
        vocab: Vocabulary object
        hps: hyperparameters
    """
    self.hps = hps

    # Get ids of special tokens
    start_decoding = vocab.word2id(data.START_DECODING)
    stop_decoding = vocab.word2id(data.STOP_DECODING)

    # Process the article
    article_words = article.split()
    if len(article_words) > hps.max_enc_steps:
        article_words = article_words[:hps.max_enc_steps]
    self.enc_len = len(article_words)  # store the length after truncation but before padding
    self.enc_input = [vocab.word2id(w) for w in article_words]  # list of word ids; OOVs are represented by the id for UNK token

    # Process the abstract
    abstract = ' '.join(abstract_sentences)  # string
    abstract_words = abstract.split()  # list of strings
    abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

    # Get the decoder input sequence and target sequence
    self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
    self.dec_len = len(self.dec_input)

    # If using pointer-generator mode, we need to store some extra info
    if hps.pointer_gen:
        # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id;
        # also store the in-article OOV words themselves
        self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

        # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
        abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

        # Overwrite decoder target sequence so it uses the temp article OOV ids
        _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding)

    # Store the original strings
    self.original_article = article
    self.original_abstract = abstract
    self.original_abstract_sents = abstract_sentences
Example #5
Source File: batcher.py From MAX-Text-Summarizer with Apache License 2.0
def __init__(self, article, abstract_sentences, vocab, hps):
    """Initializes the Example, performing tokenization and truncation to produce the
    encoder, decoder and target sequences, which are stored in self.

    Args:
        article: source text; a string. Each token is separated by a single space.
        abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.
        vocab: Vocabulary object
        hps: hyperparameters
    """
    self.hps = hps

    # Get ids of special tokens
    start_decoding = vocab.word2id(data.START_DECODING)
    stop_decoding = vocab.word2id(data.STOP_DECODING)

    # Process the article
    article_words = article.split()
    if len(article_words) > hps.max_enc_steps:
        article_words = article_words[:hps.max_enc_steps]
    self.enc_len = len(article_words)  # store the length after truncation but before padding
    self.enc_input = [vocab.word2id(w) for w in article_words]  # list of word ids; OOVs are represented by the id for UNK token

    # Process the abstract
    abstract = ' '.join(abstract_sentences)  # string
    abstract_words = abstract.split()  # list of strings
    abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

    # Get the decoder input sequence and target sequence
    self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
    self.dec_len = len(self.dec_input)

    # If using pointer-generator mode, we need to store some extra info
    if hps.pointer_gen:
        # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id;
        # also store the in-article OOV words themselves
        self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

        # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
        abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

        # Overwrite decoder target sequence so it uses the temp article OOV ids
        _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding)

    # Store the original strings
    self.original_article = article
    self.original_abstract = abstract
    self.original_abstract_sents = abstract_sentences
Example #6
Source File: batcher.py From pointer-generator with Apache License 2.0
def __init__(self, article, abstract_sentences, vocab, hps):
    """Initializes the Example, performing tokenization and truncation to produce the
    encoder, decoder and target sequences, which are stored in self.

    Args:
        article: source text; a string. Each token is separated by a single space.
        abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space.
        vocab: Vocabulary object
        hps: hyperparameters
    """
    self.hps = hps

    # Get ids of special tokens
    start_decoding = vocab.word2id(data.START_DECODING)
    stop_decoding = vocab.word2id(data.STOP_DECODING)

    # Process the article
    article_words = article.split()
    if len(article_words) > hps.max_enc_steps:
        article_words = article_words[:hps.max_enc_steps]
    self.enc_len = len(article_words)  # store the length after truncation but before padding
    self.enc_input = [vocab.word2id(w) for w in article_words]  # list of word ids; OOVs are represented by the id for UNK token

    # Process the abstract
    abstract = ' '.join(abstract_sentences)  # string
    abstract_words = abstract.split()  # list of strings
    abs_ids = [vocab.word2id(w) for w in abstract_words]  # list of word ids; OOVs are represented by the id for UNK token

    # Get the decoder input sequence and target sequence
    self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
    self.dec_len = len(self.dec_input)

    # If using pointer-generator mode, we need to store some extra info
    if hps.pointer_gen:
        # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id;
        # also store the in-article OOV words themselves
        self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)

        # Get a version of the reference summary where in-article OOVs are represented by their temporary article OOV id
        abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs)

        # Overwrite decoder target sequence so it uses the temp article OOV ids
        _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding)

    # Store the original strings
    self.original_article = article
    self.original_abstract = abstract
    self.original_abstract_sents = abstract_sentences
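To make the temporary OOV ids concrete, here is a hypothetical usage of the article2ids sketch from the top of this page. TinyVocab is a stub for illustration only and is not part of any of the repositories above:

class TinyVocab:
    """Toy vocabulary: id 0 is the UNK token; unknown words map to it."""
    def __init__(self, words):
        self._word2id = {w: i for i, w in enumerate([UNKNOWN_TOKEN] + words)}
    def word2id(self, w):
        return self._word2id.get(w, 0)
    def size(self):
        return len(self._word2id)

vocab = TinyVocab(['the', 'cat', 'sat'])
ids, oovs = article2ids('the cat sat on the mat'.split(), vocab)
print(ids)   # [1, 2, 3, 4, 1, 5] -- 'on' and 'mat' get temporary ids 4 and 5
print(oovs)  # ['on', 'mat']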