Python allennlp.common.util.START_SYMBOL Examples
The following are 18 code examples of allennlp.common.util.START_SYMBOL, taken from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the module allennlp.common.util.
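Before diving into the examples, a brief note on what the constant actually is: START_SYMBOL (and its counterpart END_SYMBOL) is a plain string marker that dataset readers prepend (and append) to token sequences so that sequence models know where a sequence begins and ends. The minimal sketch below is not taken from any of the projects listed here; it assumes the older allennlp 0.x API that these examples target, and simply shows the wrapping pattern that recurs throughout them.

from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data.tokenizers import Token, WordTokenizer

tokenizer = WordTokenizer()
tokens = tokenizer.tokenize("a short example sentence")

# Wrap the token sequence with the boundary markers, as most readers below do.
tokens.insert(0, Token(START_SYMBOL))
tokens.append(Token(END_SYMBOL))

print([t.text for t in tokens])
# ['@start@', 'a', 'short', 'example', 'sentence', '@end@']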
Example #1
Source File: summarization_sentence_tagger_reader.py From summarus with Apache License 2.0

def text_to_instance(self, text: str, sentences: List[str] = None, tags: List[int] = None) -> Instance:
    if sentences is None:
        if self._language == "ru":
            sentences = [s.text for s in razdel.sentenize(text)]
        else:
            sentences = nltk.tokenize.sent_tokenize(text)
    sentences_tokens = []
    for sentence in sentences[:self._max_sentences_count]:
        sentence = sentence.lower() if self._lowercase else sentence
        tokens = self._tokenizer.tokenize(sentence)[:self._sentence_max_tokens]
        tokens.insert(0, Token(START_SYMBOL))
        tokens.append(Token(END_SYMBOL))
        indexed_tokens = TextField(tokens, self._source_token_indexers)
        sentences_tokens.append(indexed_tokens)

    sentences_tokens_indexed = ListField(sentences_tokens)
    result = {'source_sentences': sentences_tokens_indexed}

    if tags:
        result["sentences_tags"] = SequenceLabelField(tags[:self._max_sentences_count], sentences_tokens_indexed)

    return Instance(result)
Example #2
Source File: imdb_review_reader.py From topic-rnn with Apache License 2.0

def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             words_per_instance: int = 35,
             classification_mode=False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer(
        start_tokens=[START_SYMBOL],
        end_tokens=[END_SYMBOL]
    )
    self._token_indexers = token_indexers or {
        "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
    }
    self._words_per_instance = words_per_instance
    self._classification_mode = classification_mode
Example #3
Source File: imdb_review_reader.py From topic-rnn with Apache License 2.0

def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             words_per_instance: int = 35) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer(
        start_tokens=[START_SYMBOL],
        end_tokens=[END_SYMBOL]
    )
    self._token_indexers = token_indexers or {
        "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
    }
    self._words_per_instance = words_per_instance
Example #4
Source File: test_readers.py From summarus with Apache License 2.0

def test_ria_reader(self):
    tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
    reader = RIAReader(tokenizer)
    dataset = reader.read(RIA_EXAMPLE_FILE)
    for sample in dataset:
        self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["source_tokens"]), 2)

        self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["target_tokens"]), 2)
Example #5
Source File: pytorch_misc.py From HGL-pytorch with MIT License

def detokenize(array, vocab):
    """
    Given an array of ints, we'll turn this into a string or a list of strings.
    :param array: possibly multidimensional numpy array
    :return:
    """
    if array.ndim > 1:
        return [detokenize(x, vocab) for x in array]
    tokenized = [vocab.get_token_from_index(v) for v in array]
    return ' '.join([x for x in tokenized if x not in (vocab._padding_token, START_SYMBOL, END_SYMBOL)])
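A hedged usage sketch for the detokenize helper above (the vocabulary setup is illustrative and not part of the original file): build a small AllenNLP Vocabulary, index a sequence that includes the boundary symbols, and confirm that padding, START_SYMBOL, and END_SYMBOL are dropped when joining the tokens back into a string.

import numpy as np
from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data import Vocabulary

vocab = Vocabulary()
for word in [START_SYMBOL, "hello", "world", END_SYMBOL]:
    vocab.add_token_to_namespace(word)

# Indices for the sequence "@start@ hello world @end@".
ids = np.array([vocab.get_token_index(w) for w in [START_SYMBOL, "hello", "world", END_SYMBOL]])

print(detokenize(ids, vocab))  # expected: "hello world"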
Example #6
Source File: test_readers.py From summarus with Apache License 2.0

def test_cnn_dailymail_reader(self):
    tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
    reader = CNNDailyMailReader(tokenizer, cnn_tokenized_dir=TEST_STORIES_DIR, separate_namespaces=False)
    dataset = reader.read(TEST_URLS_FILE)
    for sample in dataset:
        self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["source_tokens"]), 2)

        self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
        self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
        self.assertGreater(len(sample.fields["target_tokens"]), 2)
Example #7
Source File: ir_labeled_tuple_loader.py From transformer-kernel-ranking with Apache License 2.0

def text_to_instance(self, query_id: str, doc_id: str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ

    query_id_field = MetadataField(query_id)
    doc_id_field = MetadataField(doc_id)

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    #if self._source_add_start_token:
    #    query_tokenized.insert(0, Token(START_SYMBOL))
    #query_tokenized.append(Token(END_SYMBOL))
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]
    if self.min_query_length > -1 and len(query_tokenized) < self.min_query_length:
        query_tokenized = query_tokenized + [self.padding_value] * (self.min_query_length - len(query_tokenized))

    query_field = TextField(query_tokenized, self._token_indexers)

    doc_tokenized = self._tokenizer.tokenize(doc_sequence)
    #doc_tokenized.insert(0, Token(START_SYMBOL))
    #doc_tokenized.append(Token(END_SYMBOL))
    if self.max_doc_length > -1:
        doc_tokenized = doc_tokenized[:self.max_doc_length]
    if self.min_doc_length > -1 and len(doc_tokenized) < self.min_doc_length:
        doc_tokenized = doc_tokenized + [self.padding_value] * (self.min_doc_length - len(doc_tokenized))

    doc_field = TextField(doc_tokenized, self._token_indexers)

    return Instance({
        "query_id": query_id_field,
        "doc_id": doc_id_field,
        "query_tokens": query_field,
        "doc_tokens": doc_field})
Example #8
Source File: ir_triple_loader.py From sigir19-neural-ir with Apache License 2.0

def text_to_instance(self, query_sequence: str, doc_pos_sequence: str, doc_neg_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    #if self._source_add_start_token:
    #    query_tokenized.insert(0, Token(START_SYMBOL))
    #query_tokenized.append(Token(END_SYMBOL))
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]

    query_field = TextField(query_tokenized, self._token_indexers)

    doc_pos_tokenized = self._tokenizer.tokenize(doc_pos_sequence)
    #doc_pos_tokenized.insert(0, Token(START_SYMBOL))
    #doc_pos_tokenized.append(Token(END_SYMBOL))
    if self.max_doc_length > -1:
        doc_pos_tokenized = doc_pos_tokenized[:self.max_doc_length]

    doc_pos_field = TextField(doc_pos_tokenized, self._token_indexers)

    doc_neg_tokenized = self._tokenizer.tokenize(doc_neg_sequence)
    #doc_neg_tokenized.insert(0, Token(START_SYMBOL))
    #doc_neg_tokenized.append(Token(END_SYMBOL))
    if self.max_doc_length > -1:
        doc_neg_tokenized = doc_neg_tokenized[:self.max_doc_length]

    doc_neg_field = TextField(doc_neg_tokenized, self._token_indexers)

    query_length = LabelField(len(query_tokenized), skip_indexing=True)
    doc_pos_length = LabelField(len(doc_pos_tokenized), skip_indexing=True)
    doc_neg_length = LabelField(len(doc_neg_tokenized), skip_indexing=True)

    return Instance({
        "query_tokens": query_field,
        "doc_pos_tokens": doc_pos_field,
        "doc_neg_tokens": doc_neg_field,
        "query_length": query_length,
        "doc_pos_length": doc_pos_length,
        "doc_neg_length": doc_neg_length})
Example #9
Source File: ir_labeled_tuple_loader.py From sigir19-neural-ir with Apache License 2.0

def text_to_instance(self, query_id: str, doc_id: str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ

    query_id_field = LabelField(int(query_id), skip_indexing=True)
    doc_id_field = LabelField(int(doc_id), skip_indexing=True)

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    #if self._source_add_start_token:
    #    query_tokenized.insert(0, Token(START_SYMBOL))
    #query_tokenized.append(Token(END_SYMBOL))
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]

    query_field = TextField(query_tokenized, self._token_indexers)

    doc_tokenized = self._tokenizer.tokenize(doc_sequence)
    #doc_tokenized.insert(0, Token(START_SYMBOL))
    #doc_tokenized.append(Token(END_SYMBOL))
    if self.max_doc_length > -1:
        doc_tokenized = doc_tokenized[:self.max_doc_length]

    doc_field = TextField(doc_tokenized, self._token_indexers)

    query_length = LabelField(len(query_tokenized), skip_indexing=True)
    doc_length = LabelField(len(doc_tokenized), skip_indexing=True)

    return Instance({
        "query_id": query_id_field,
        "doc_id": doc_id_field,
        "query_tokens": query_field,
        "doc_tokens": doc_field,
        "query_length": query_length,
        "doc_length": doc_length})
Example #10
Source File: domain_language.py From allennlp-semparse with Apache License 2.0

def get_nonterminal_productions(self) -> Dict[str, List[str]]:
    """
    Induces a grammar from the defined collection of predicates in this language and returns
    all productions in that grammar, keyed by the non-terminal they are expanding.

    This includes terminal productions implied by each predicate as well as productions for the
    `return type` of each defined predicate.  For example, defining a "multiply" predicate adds
    a "<int,int:int> -> multiply" terminal production to the grammar, and `also` a
    "int -> [<int,int:int>, int, int]" non-terminal production, because I can use the "multiply"
    predicate to produce an int.
    """
    if not self._nonterminal_productions:
        actions: Dict[str, Set[str]] = defaultdict(set)
        # If you didn't give us a set of valid start types, we'll assume all types we know
        # about (including functional types) are valid start types.
        if self._start_types:
            start_types = self._start_types
        else:
            start_types = set()
            for type_list in self._function_types.values():
                start_types.update(type_list)
        for start_type in start_types:
            actions[START_SYMBOL].add(f"{START_SYMBOL} -> {start_type}")
        for name, function_type_list in self._function_types.items():
            for function_type in function_type_list:
                actions[str(function_type)].add(f"{function_type} -> {name}")
                if isinstance(function_type, FunctionType):
                    return_type = function_type.return_type
                    arg_types = function_type.argument_types
                    right_side = f"[{function_type}, {', '.join(str(arg_type) for arg_type in arg_types)}]"
                    actions[str(return_type)].add(f"{return_type} -> {right_side}")
        self._nonterminal_productions = {key: sorted(value) for key, value in actions.items()}
    return self._nonterminal_productions
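The docstring above is easiest to follow with a toy language. The sketch below is illustrative only: the Arithmetic class is an assumption modeled on the allennlp-semparse DomainLanguage API (import path assumed), not code from that project. Defining a single multiply predicate yields a terminal production for the predicate, a non-terminal production for its return type, and a start production keyed by START_SYMBOL.

from allennlp_semparse.domain_languages.domain_language import DomainLanguage, predicate

class Arithmetic(DomainLanguage):
    def __init__(self):
        super().__init__(start_types={int})

    @predicate
    def multiply(self, num1: int, num2: int) -> int:
        return num1 * num2

language = Arithmetic()
for nonterminal, productions in language.get_nonterminal_productions().items():
    print(nonterminal, productions)
# Expected, roughly:
#   @start@ ['@start@ -> int']
#   int ['int -> [<int,int:int>, int, int]']
#   <int,int:int> ['<int,int:int> -> multiply']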
Example #11
Source File: type_declaration.py From allennlp-semparse with Apache License 2.0

def __str__(self):
    # TODO (pradeep): This limits the number of basic types we can have to 26. We may want to
    # change this in the future if we extend to domains where we have more than 26 basic types.
    if self._string_rep == START_SYMBOL:
        return START_SYMBOL
    else:
        return self._string_rep.lower()[0]
Example #12
Source File: pytorch_misc.py From r2c with MIT License

def detokenize(array, vocab):
    """
    Given an array of ints, we'll turn this into a string or a list of strings.
    :param array: possibly multidimensional numpy array
    :return:
    """
    if array.ndim > 1:
        return [detokenize(x, vocab) for x in array]
    tokenized = [vocab.get_token_from_index(v) for v in array]
    return ' '.join([x for x in tokenized if x not in (vocab._padding_token, START_SYMBOL, END_SYMBOL)])
Example #13
Source File: simple_seq2seq.py From magnitude with MIT License

def __init__(self,
             vocab,
             source_embedder,
             encoder,
             max_decoding_steps,
             target_namespace=u"tokens",
             target_embedding_dim=None,
             attention_function=None,
             scheduled_sampling_ratio=0.0):
    super(SimpleSeq2Seq, self).__init__(vocab)
    self._source_embedder = source_embedder
    self._encoder = encoder
    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace
    self._attention_function = attention_function
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    num_classes = self.vocab.get_vocab_size(self._target_namespace)
    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
    # we're using attention with ``DotProductSimilarity``, this is needed.
    self._decoder_output_dim = self._encoder.get_output_dim()
    target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)
    if self._attention_function:
        self._decoder_attention = LegacyAttention(self._attention_function)
        # The output of attention, a weighted average over encoder outputs, will be
        # concatenated to the input vector of the decoder at each time step.
        self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
    else:
        self._decoder_input_dim = target_embedding_dim
    # TODO (pradeep): Do not hardcode decoder cell type.
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)

#overrides
Example #14
Source File: type_declaration.py From magnitude with MIT License

def __str__(self):
    # TODO (pradeep): This limits the number of basic types we can have to 26. We may want to
    # change this in the future if we extend to domains where we have more than 26 basic types.
    if self._string_rep == START_SYMBOL:
        return START_SYMBOL
    else:
        return self._string_rep.lower()[0]
Example #15
Source File: seq2seq.py From magnitude with MIT License

def text_to_instance(self, source_string, target_string=None):  # type: ignore
    # pylint: disable=arguments-differ
    tokenized_source = self._source_tokenizer.tokenize(source_string)
    if self._source_add_start_token:
        tokenized_source.insert(0, Token(START_SYMBOL))
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self._source_token_indexers)
    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        return Instance({u"source_tokens": source_field, u"target_tokens": target_field})
    else:
        return Instance({u'source_tokens': source_field})
Example #16
Source File: custom_autoregressive_seq2seq_decoder.py From summarus with Apache License 2.0

def __init__(
    self,
    vocab: Vocabulary,
    decoder_net: DecoderNet,
    max_decoding_steps: int,
    target_embedder: Embedding,
    target_namespace: str = "tokens",
    tie_output_embedding: bool = False,
    scheduled_sampling_ratio: float = 0,
    label_smoothing_ratio: Optional[float] = None,
    beam_size: int = 4,
    tensor_based_metric: Metric = None,
    token_based_metric: Metric = None,
) -> None:
    super().__init__(target_embedder)

    self._vocab = vocab
    self._decoder_net = decoder_net
    self._max_decoding_steps = max_decoding_steps
    self._target_namespace = target_namespace
    self._label_smoothing_ratio = label_smoothing_ratio

    self._start_index = self._vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self._vocab.get_token_index(END_SYMBOL, self._target_namespace)
    self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)

    target_vocab_size = self._vocab.get_vocab_size(self._target_namespace)

    if self.target_embedder.get_output_dim() != self._decoder_net.target_embedding_dim:
        raise ConfigurationError("Target Embedder output_dim doesn't match decoder module's input.")

    self._output_projection_layer = Linear(self._decoder_net.get_output_dim(), target_vocab_size)
    if tie_output_embedding:
        if self._output_projection_layer.weight.shape != self.target_embedder.weight.shape:
            raise ConfigurationError("Can't tie embeddings with output linear layer, due to shape mismatch")
        self._output_projection_layer.weight = self.target_embedder.weight

    self._tensor_based_metric = tensor_based_metric
    self._token_based_metric = token_based_metric
    self._scheduled_sampling_ratio = scheduled_sampling_ratio
Example #17
Source File: summarization_reader.py From summarus with Apache License 2.0

def text_to_instance(self, source: str, target: str = None) -> Instance:
    def prepare_text(text, max_tokens):
        text = text.lower() if self._lowercase else text
        tokens = self._tokenizer.tokenize(text)[:max_tokens]
        tokens.insert(0, Token(START_SYMBOL))
        tokens.append(Token(END_SYMBOL))
        return tokens

    source_tokens = prepare_text(source, self._source_max_tokens)
    source_tokens_indexed = TextField(source_tokens, self._source_token_indexers)
    result = {'source_tokens': source_tokens_indexed}
    meta_fields = {}

    if self._save_copy_fields:
        source_to_target_field = NamespaceSwappingField(source_tokens[1:-1], self._target_namespace)
        result["source_to_target"] = source_to_target_field
        meta_fields["source_tokens"] = [x.text for x in source_tokens[1:-1]]

    if self._save_pgn_fields:
        source_to_target_field = NamespaceSwappingField(source_tokens, self._target_namespace)
        result["source_to_target"] = source_to_target_field
        meta_fields["source_tokens"] = [x.text for x in source_tokens]

    if target:
        target_tokens = prepare_text(target, self._target_max_tokens)
        target_tokens_indexed = TextField(target_tokens, self._target_token_indexers)
        result['target_tokens'] = target_tokens_indexed

        if self._save_pgn_fields:
            meta_fields["target_tokens"] = [y.text for y in target_tokens]
            source_and_target_token_ids = self._tokens_to_ids(source_tokens + target_tokens, self._lowercase)
            source_token_ids = source_and_target_token_ids[:len(source_tokens)]
            result["source_token_ids"] = ArrayField(np.array(source_token_ids, dtype='long'))
            target_token_ids = source_and_target_token_ids[len(source_tokens):]
            result["target_token_ids"] = ArrayField(np.array(target_token_ids, dtype='long'))

        if self._save_copy_fields:
            meta_fields["target_tokens"] = [y.text for y in target_tokens[1:-1]]
            source_and_target_token_ids = self._tokens_to_ids(source_tokens[1:-1] + target_tokens, self._lowercase)
            source_token_ids = source_and_target_token_ids[:len(source_tokens)-2]
            result["source_token_ids"] = ArrayField(np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(source_tokens)-2:]
            result["target_token_ids"] = ArrayField(np.array(target_token_ids))

    elif self._save_copy_fields:
        source_token_ids = self._tokens_to_ids(source_tokens[1:-1], self._lowercase)
        result["source_token_ids"] = ArrayField(np.array(source_token_ids))

    elif self._save_pgn_fields:
        source_token_ids = self._tokens_to_ids(source_tokens, self._lowercase)
        result["source_token_ids"] = ArrayField(np.array(source_token_ids))

    if self._save_copy_fields or self._save_pgn_fields:
        result["metadata"] = MetadataField(meta_fields)

    return Instance(result)
Example #18
Source File: action_space_walker.py From magnitude with MIT License

def _walk(self):
    u"""
    Walk over action space to collect completed paths of at most ``self._max_path_length`` steps.
    """
    # Buffer of NTs to expand, previous actions
    incomplete_paths = [([unicode(type_)], ["{START_SYMBOL} -> {type_}".format(START_SYMBOL=START_SYMBOL, type_=type_)])
                        for type_ in self._world.get_valid_starting_types()]
    self._completed_paths = []
    actions = self._world.get_valid_actions()
    # Overview: We keep track of the buffer of non-terminals to expand, and the action history
    # for each incomplete path. At every iteration in the while loop below, we iterate over all
    # incomplete paths, expand one non-terminal from the buffer in a depth-first fashion, get
    # all possible next actions triggered by that non-terminal and add to the paths. Then, we
    # check the expanded paths, to see if they are 1) complete, in which case they are
    # added to completed_paths, 2) longer than max_path_length, in which case they are
    # discarded, or 3) neither, in which case they are used to form the incomplete_paths for the
    # next iteration of this while loop.
    # While the non-terminal expansion is done in a depth-first fashion, note that the search over
    # the action space itself is breadth-first.
    while incomplete_paths:
        next_paths = []
        for nonterminal_buffer, history in incomplete_paths:
            # Taking the last non-terminal added to the buffer. We're going depth-first.
            nonterminal = nonterminal_buffer.pop()
            # Iterating over all possible next actions.
            for action in actions[nonterminal]:
                new_history = history + [action]
                new_nonterminal_buffer = nonterminal_buffer[:]
                # Since we expand the last action added to the buffer, the left child should be
                # added after the right child.
                for right_side_part in reversed(self._get_right_side_parts(action)):
                    if types.is_nonterminal(right_side_part):
                        new_nonterminal_buffer.append(right_side_part)
                next_paths.append((new_nonterminal_buffer, new_history))
        incomplete_paths = []
        for nonterminal_buffer, path in next_paths:
            # An empty buffer means that we've completed this path.
            if not nonterminal_buffer:
                # Indexing completed paths by the nonterminals they contain.
                next_path_index = len(self._completed_paths)
                for action in path:
                    for value in self._get_right_side_parts(action):
                        if not types.is_nonterminal(value):
                            self._terminal_path_index[action].add(next_path_index)
                self._completed_paths.append(path)
            # We're adding to incomplete_paths for the next iteration, only those paths that are
            # shorter than the max_path_length. The remaining paths will be discarded.
            elif len(path) <= self._max_path_length:
                incomplete_paths.append((nonterminal_buffer, path))