Python allennlp.common.util.END_SYMBOL Examples
The following are 15 code examples of allennlp.common.util.END_SYMBOL.
Each example is taken from an open-source project; the source file, project, and license are noted above each listing.
You may also want to check out all available functions and classes of the module allennlp.common.util.
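Across these examples, END_SYMBOL shows up in two recurring patterns: it is appended (usually together with START_SYMBOL) to a tokenized sequence before indexing, and it is looked up in a Vocabulary to obtain the end-of-sequence index that a decoder uses to know when to stop. The sketch below illustrates both patterns with a toy, hand-populated vocabulary; it is purely illustrative, and exact import paths may differ slightly between AllenNLP versions.

from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data import Token, Vocabulary

# Pattern 1: wrap a tokenized sequence with the sentinel tokens,
# as the dataset readers below do before building a TextField.
tokens = [Token(t) for t in ["the", "cat", "sat"]]
tokens.insert(0, Token(START_SYMBOL))
tokens.append(Token(END_SYMBOL))

# Pattern 2: look up the end-of-sequence index in a vocabulary,
# as the seq2seq models below do when deciding where decoding stops.
# This vocabulary is a toy one, populated by hand for illustration only.
vocab = Vocabulary()
vocab.add_token_to_namespace(START_SYMBOL, namespace="tokens")
vocab.add_token_to_namespace(END_SYMBOL, namespace="tokens")
end_index = vocab.get_token_index(END_SYMBOL, namespace="tokens")

print([t.text for t in tokens])  # ['@start@', 'the', 'cat', 'sat', '@end@']
print(end_index)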
Example #1
Source File: api.py From rupo with Apache License 2.0
def get_generator(self, model_path: str, token_vocab_path: str, stress_vocab_dump_path: str) -> Generator:
    if self.generator is None:
        assert os.path.isdir(model_path) and os.path.isdir(token_vocab_path)
        vocabulary = Vocabulary.from_files(token_vocab_path)
        stress_vocabulary = StressVocabulary()
        if not os.path.isfile(stress_vocab_dump_path):
            stress_vocabulary = inflate_stress_vocabulary(vocabulary, self.get_stress_predictor())
            stress_vocabulary.save(stress_vocab_dump_path)
        else:
            stress_vocabulary.load(stress_vocab_dump_path)
        eos_index = vocabulary.get_token_index(END_SYMBOL)
        unk_index = vocabulary.get_token_index(DEFAULT_OOV_TOKEN)
        exclude_transform = ExcludeTransform((unk_index, eos_index))
        model = LanguageModel.load(model_path, vocabulary_dir=token_vocab_path,
                                   transforms=[exclude_transform, ])
        self.generator = Generator(model, vocabulary, stress_vocabulary, eos_index)
    return self.generator
Example #2
Source File: imdb_review_reader.py From topic-rnn with Apache License 2.0
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             words_per_instance: int = 35) -> None:
    super().__init__(lazy)

    self._tokenizer = tokenizer or WordTokenizer(
        start_tokens=[START_SYMBOL],
        end_tokens=[END_SYMBOL]
    )
    self._token_indexers = token_indexers or {
        "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
    }
    self._words_per_instance = words_per_instance
Example #3
Source File: imdb_review_reader.py From topic-rnn with Apache License 2.0
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             words_per_instance: int = 35,
             classification_mode=False) -> None:
    super().__init__(lazy)

    self._tokenizer = tokenizer or WordTokenizer(
        start_tokens=[START_SYMBOL],
        end_tokens=[END_SYMBOL]
    )
    self._token_indexers = token_indexers or {
        "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
    }
    self._words_per_instance = words_per_instance
    self._classification_mode = classification_mode
Example #4
Source File: summarization_sentence_tagger_reader.py From summarus with Apache License 2.0
def text_to_instance(self, text: str, sentences: List[str] = None, tags: List[int] = None) -> Instance:
    if sentences is None:
        if self._language == "ru":
            sentences = [s.text for s in razdel.sentenize(text)]
        else:
            sentences = nltk.tokenize.sent_tokenize(text)
    sentences_tokens = []
    for sentence in sentences[:self._max_sentences_count]:
        sentence = sentence.lower() if self._lowercase else sentence
        tokens = self._tokenizer.tokenize(sentence)[:self._sentence_max_tokens]
        tokens.insert(0, Token(START_SYMBOL))
        tokens.append(Token(END_SYMBOL))
        indexed_tokens = TextField(tokens, self._source_token_indexers)
        sentences_tokens.append(indexed_tokens)

    sentences_tokens_indexed = ListField(sentences_tokens)
    result = {'source_sentences': sentences_tokens_indexed}

    if tags:
        result["sentences_tags"] = SequenceLabelField(tags[:self._max_sentences_count], sentences_tokens_indexed)
    return Instance(result)
Example #5
Source File: pytorch_misc.py From HGL-pytorch with MIT License
def detokenize(array, vocab):
    """
    Given an array of ints, we'll turn this into a string or a list of strings.
    :param array: possibly multidimensional numpy array
    :return:
    """
    if array.ndim > 1:
        return [detokenize(x, vocab) for x in array]
    tokenized = [vocab.get_token_from_index(v) for v in array]
    return ' '.join([x for x in tokenized if x not in (vocab._padding_token, START_SYMBOL, END_SYMBOL)])
Example #6
Source File: seq2seq.py From magnitude with MIT License
def text_to_instance(self, source_string, target_string=None):  # type: ignore
    # pylint: disable=arguments-differ
    tokenized_source = self._source_tokenizer.tokenize(source_string)
    if self._source_add_start_token:
        tokenized_source.insert(0, Token(START_SYMBOL))
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self._source_token_indexers)
    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        return Instance({u"source_tokens": source_field, u"target_tokens": target_field})
    else:
        return Instance({u'source_tokens': source_field})
Example #7
Source File: simple_seq2seq.py From magnitude with MIT License
def __init__(self,
             vocab,
             source_embedder,
             encoder,
             max_decoding_steps,
             target_namespace=u"tokens",
             target_embedding_dim=None,
             attention_function=None,
             scheduled_sampling_ratio=0.0):
    super(SimpleSeq2Seq, self).__init__(vocab)
    self._source_embedder = source_embedder
    self._encoder = encoder
    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace
    self._attention_function = attention_function
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
    # we're using attention with ``DotProductSimilarity``, this is needed.
    self._decoder_output_dim = self._encoder.get_output_dim()
    target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)
    if self._attention_function:
        self._decoder_attention = LegacyAttention(self._attention_function)
        # The output of attention, a weighted average over encoder outputs, will be
        # concatenated to the input vector of the decoder at each time step.
        self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
    else:
        self._decoder_input_dim = target_embedding_dim
    # TODO (pradeep): Do not hardcode decoder cell type.
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
Example #8
Source File: pytorch_misc.py From r2c with MIT License
def detokenize(array, vocab):
    """
    Given an array of ints, we'll turn this into a string or a list of strings.
    :param array: possibly multidimensional numpy array
    :return:
    """
    if array.ndim > 1:
        return [detokenize(x, vocab) for x in array]
    tokenized = [vocab.get_token_from_index(v) for v in array]
    return ' '.join([x for x in tokenized if x not in (vocab._padding_token, START_SYMBOL, END_SYMBOL)])
Example #9
Source File: ir_labeled_tuple_loader.py From sigir19-neural-ir with Apache License 2.0
def text_to_instance(self, query_id: str, doc_id: str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ

    query_id_field = LabelField(int(query_id), skip_indexing=True)
    doc_id_field = LabelField(int(doc_id), skip_indexing=True)

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    #if self._source_add_start_token:
    #    query_tokenized.insert(0, Token(START_SYMBOL))
    #query_tokenized.append(Token(END_SYMBOL))
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]

    query_field = TextField(query_tokenized, self._token_indexers)

    doc_tokenized = self._tokenizer.tokenize(doc_sequence)
    #doc_tokenized.insert(0, Token(START_SYMBOL))
    #doc_tokenized.append(Token(END_SYMBOL))
    if self.max_doc_length > -1:
        doc_tokenized = doc_tokenized[:self.max_doc_length]

    doc_field = TextField(doc_tokenized, self._token_indexers)

    query_length = LabelField(len(query_tokenized), skip_indexing=True)
    doc_length = LabelField(len(doc_tokenized), skip_indexing=True)

    return Instance({
        "query_id": query_id_field,
        "doc_id": doc_id_field,
        "query_tokens": query_field,
        "doc_tokens": doc_field,
        "query_length": query_length,
        "doc_length": doc_length})
Example #10
Source File: ir_triple_loader.py From sigir19-neural-ir with Apache License 2.0
def text_to_instance(self, query_sequence: str, doc_pos_sequence: str, doc_neg_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    #if self._source_add_start_token:
    #    query_tokenized.insert(0, Token(START_SYMBOL))
    #query_tokenized.append(Token(END_SYMBOL))
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]

    query_field = TextField(query_tokenized, self._token_indexers)

    doc_pos_tokenized = self._tokenizer.tokenize(doc_pos_sequence)
    #doc_pos_tokenized.insert(0, Token(START_SYMBOL))
    #doc_pos_tokenized.append(Token(END_SYMBOL))
    if self.max_doc_length > -1:
        doc_pos_tokenized = doc_pos_tokenized[:self.max_doc_length]

    doc_pos_field = TextField(doc_pos_tokenized, self._token_indexers)

    doc_neg_tokenized = self._tokenizer.tokenize(doc_neg_sequence)
    #doc_neg_tokenized.insert(0, Token(START_SYMBOL))
    #doc_neg_tokenized.append(Token(END_SYMBOL))
    if self.max_doc_length > -1:
        doc_neg_tokenized = doc_neg_tokenized[:self.max_doc_length]

    doc_neg_field = TextField(doc_neg_tokenized, self._token_indexers)

    query_length = LabelField(len(query_tokenized), skip_indexing=True)
    doc_pos_length = LabelField(len(doc_pos_tokenized), skip_indexing=True)
    doc_neg_length = LabelField(len(doc_neg_tokenized), skip_indexing=True)

    return Instance({
        "query_tokens": query_field,
        "doc_pos_tokens": doc_pos_field,
        "doc_neg_tokens": doc_neg_field,
        "query_length": query_length,
        "doc_pos_length": doc_pos_length,
        "doc_neg_length": doc_neg_length})
Example #11
Source File: ir_labeled_tuple_loader.py From transformer-kernel-ranking with Apache License 2.0
def text_to_instance(self, query_id: str, doc_id: str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ

    query_id_field = MetadataField(query_id)
    doc_id_field = MetadataField(doc_id)

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    #if self._source_add_start_token:
    #    query_tokenized.insert(0, Token(START_SYMBOL))
    #query_tokenized.append(Token(END_SYMBOL))
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]
    if self.min_query_length > -1 and len(query_tokenized) < self.min_query_length:
        query_tokenized = query_tokenized + [self.padding_value] * (self.min_query_length - len(query_tokenized))

    query_field = TextField(query_tokenized, self._token_indexers)

    doc_tokenized = self._tokenizer.tokenize(doc_sequence)
    #doc_tokenized.insert(0, Token(START_SYMBOL))
    #doc_tokenized.append(Token(END_SYMBOL))
    if self.max_doc_length > -1:
        doc_tokenized = doc_tokenized[:self.max_doc_length]
    if self.min_doc_length > -1 and len(doc_tokenized) < self.min_doc_length:
        doc_tokenized = doc_tokenized + [self.padding_value] * (self.min_doc_length - len(doc_tokenized))

    doc_field = TextField(doc_tokenized, self._token_indexers)

    return Instance({
        "query_id": query_id_field,
        "doc_id": doc_id_field,
        "query_tokens": query_field,
        "doc_tokens": doc_field})
Example #12
Source File: test_readers.py From summarus with Apache License 2.0
def test_cnn_dailymail_reader(self):
    tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
    reader = CNNDailyMailReader(tokenizer, cnn_tokenized_dir=TEST_STORIES_DIR, separate_namespaces=False)
    dataset = reader.read(TEST_URLS_FILE)
    for sample in dataset:
        self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["source_tokens"]), 2)

        self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["target_tokens"]), 2)
Example #13
Source File: test_readers.py From summarus with Apache License 2.0
def test_ria_reader(self):
    tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
    reader = RIAReader(tokenizer)
    dataset = reader.read(RIA_EXAMPLE_FILE)
    for sample in dataset:
        self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["source_tokens"]), 2)

        self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["target_tokens"]), 2)
Example #14
Source File: custom_autoregressive_seq2seq_decoder.py From summarus with Apache License 2.0
def __init__(
    self,
    vocab: Vocabulary,
    decoder_net: DecoderNet,
    max_decoding_steps: int,
    target_embedder: Embedding,
    target_namespace: str = "tokens",
    tie_output_embedding: bool = False,
    scheduled_sampling_ratio: float = 0,
    label_smoothing_ratio: Optional[float] = None,
    beam_size: int = 4,
    tensor_based_metric: Metric = None,
    token_based_metric: Metric = None,
) -> None:
    super().__init__(target_embedder)

    self._vocab = vocab
    self._decoder_net = decoder_net
    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace
    self._label_smoothing_ratio = label_smoothing_ratio

    self._start_index = self._vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self._vocab.get_token_index(END_SYMBOL, self._target_namespace)
    self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)

    target_vocab_size = self._vocab.get_vocab_size(self._target_namespace)

    if self.target_embedder.get_output_dim() != self._decoder_net.target_embedding_dim:
        raise ConfigurationError("Target Embedder output_dim doesn't match decoder module's input.")

    self._output_projection_layer = Linear(self._decoder_net.get_output_dim(), target_vocab_size)

    if tie_output_embedding:
        if self._output_projection_layer.weight.shape != self.target_embedder.weight.shape:
            raise ConfigurationError("Can't tie embeddings with output linear layer, due to shape mismatch")
        self._output_projection_layer.weight = self.target_embedder.weight

    self._tensor_based_metric = tensor_based_metric
    self._token_based_metric = token_based_metric
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
Example #15
Source File: summarization_reader.py From summarus with Apache License 2.0
def text_to_instance(self, source: str, target: str = None) -> Instance:
    def prepare_text(text, max_tokens):
        text = text.lower() if self._lowercase else text
        tokens = self._tokenizer.tokenize(text)[:max_tokens]
        tokens.insert(0, Token(START_SYMBOL))
        tokens.append(Token(END_SYMBOL))
        return tokens

    source_tokens = prepare_text(source, self._source_max_tokens)
    source_tokens_indexed = TextField(source_tokens, self._source_token_indexers)
    result = {'source_tokens': source_tokens_indexed}
    meta_fields = {}

    if self._save_copy_fields:
        source_to_target_field = NamespaceSwappingField(source_tokens[1:-1], self._target_namespace)
        result["source_to_target"] = source_to_target_field
        meta_fields["source_tokens"] = [x.text for x in source_tokens[1:-1]]
    if self._save_pgn_fields:
        source_to_target_field = NamespaceSwappingField(source_tokens, self._target_namespace)
        result["source_to_target"] = source_to_target_field
        meta_fields["source_tokens"] = [x.text for x in source_tokens]

    if target:
        target_tokens = prepare_text(target, self._target_max_tokens)
        target_tokens_indexed = TextField(target_tokens, self._target_token_indexers)
        result['target_tokens'] = target_tokens_indexed

        if self._save_pgn_fields:
            meta_fields["target_tokens"] = [y.text for y in target_tokens]
            source_and_target_token_ids = self._tokens_to_ids(source_tokens + target_tokens, self._lowercase)
            source_token_ids = source_and_target_token_ids[:len(source_tokens)]
            result["source_token_ids"] = ArrayField(np.array(source_token_ids, dtype='long'))
            target_token_ids = source_and_target_token_ids[len(source_tokens):]
            result["target_token_ids"] = ArrayField(np.array(target_token_ids, dtype='long'))

        if self._save_copy_fields:
            meta_fields["target_tokens"] = [y.text for y in target_tokens[1:-1]]
            source_and_target_token_ids = self._tokens_to_ids(source_tokens[1:-1] + target_tokens, self._lowercase)
            source_token_ids = source_and_target_token_ids[:len(source_tokens)-2]
            result["source_token_ids"] = ArrayField(np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(source_tokens)-2:]
            result["target_token_ids"] = ArrayField(np.array(target_token_ids))

    elif self._save_copy_fields:
        source_token_ids = self._tokens_to_ids(source_tokens[1:-1], self._lowercase)
        result["source_token_ids"] = ArrayField(np.array(source_token_ids))
    elif self._save_pgn_fields:
        source_token_ids = self._tokens_to_ids(source_tokens, self._lowercase)
        result["source_token_ids"] = ArrayField(np.array(source_token_ids))

    if self._save_copy_fields or self._save_pgn_fields:
        result["metadata"] = MetadataField(meta_fields)
    return Instance(result)