Python allennlp.common.util.START_SYMBOL Examples

The following are 18 code examples of allennlp.common.util.START_SYMBOL. Each example links to its original project and source file. You may also want to check out all available functions/classes of the module allennlp.common.util, or try the search function.
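Before the project examples, a minimal sketch (not taken from any project below) of the usual pattern: START_SYMBOL and END_SYMBOL are plain sentinel strings ("@start@" and "@end@") that dataset readers wrap around token sequences before indexing. The Token import path shown is the one used by recent AllenNLP releases and may differ in older versions.

from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data.tokenizers import Token

tokens = [Token(t) for t in "the cat sat".split()]
tokens.insert(0, Token(START_SYMBOL))   # prepend the start sentinel
tokens.append(Token(END_SYMBOL))        # append the end sentinel
print([t.text for t in tokens])         # ['@start@', 'the', 'cat', 'sat', '@end@']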
Example #1
Source File: summarization_sentence_tagger_reader.py    From summarus with Apache License 2.0    6 votes
def text_to_instance(self, text: str, sentences: List[str] = None, tags: List[int] = None) -> Instance:
        if sentences is None:
            if self._language == "ru":
                sentences = [s.text for s in razdel.sentenize(text)]
            else:
                sentences = nltk.tokenize.sent_tokenize(text)
        sentences_tokens = []
        for sentence in sentences[:self._max_sentences_count]:
            sentence = sentence.lower() if self._lowercase else sentence
            tokens = self._tokenizer.tokenize(sentence)[:self._sentence_max_tokens]
            tokens.insert(0, Token(START_SYMBOL))
            tokens.append(Token(END_SYMBOL))
            indexed_tokens = TextField(tokens, self._source_token_indexers)
            sentences_tokens.append(indexed_tokens)

        sentences_tokens_indexed = ListField(sentences_tokens)
        result = {'source_sentences': sentences_tokens_indexed}

        if tags:
            result["sentences_tags"] = SequenceLabelField(tags[:self._max_sentences_count], sentences_tokens_indexed)
        return Instance(result) 
Example #2
Source File: imdb_review_reader.py    From topic-rnn with Apache License 2.0    6 votes
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 words_per_instance: int = 35,
                 classification_mode=False
                ) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer(
            start_tokens=[START_SYMBOL],
            end_tokens=[END_SYMBOL]
        )
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
        }

        self._words_per_instance = words_per_instance
        self._classification_mode = classification_mode 
Example #3
Source File: imdb_review_reader.py    From topic-rnn with Apache License 2.0    6 votes
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 words_per_instance: int = 35
                ) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer(
            start_tokens=[START_SYMBOL],
            end_tokens=[END_SYMBOL]
        )
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
        }

        self._words_per_instance = words_per_instance 
Example #4
Source File: test_readers.py    From summarus with Apache License 2.0    5 votes
def test_ria_reader(self):
        tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
        reader = RIAReader(tokenizer)
        dataset = reader.read(RIA_EXAMPLE_FILE)
        for sample in dataset:
            self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
            self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
            self.assertGreater(len(sample.fields["source_tokens"]), 2)

            self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
            self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
            self.assertGreater(len(sample.fields["target_tokens"]), 2) 
Example #5
Source File: pytorch_misc.py    From HGL-pytorch with MIT License    5 votes
def detokenize(array, vocab):
    """
    Given an array of ints, we'll turn this into a string or a list of strings.
    :param array: possibly multidimensional numpy array
    :return:
    """
    if array.ndim > 1:
        return [detokenize(x, vocab) for x in array]
    tokenized = [vocab.get_token_from_index(v) for v in array]
    return ' '.join([x for x in tokenized if x not in (vocab._padding_token, START_SYMBOL, END_SYMBOL)]) 
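For context, a hypothetical usage of detokenize with a standard AllenNLP Vocabulary; the vocabulary setup and imports below are assumptions for illustration, not part of pytorch_misc.py:

import numpy as np
from allennlp.data import Vocabulary
from allennlp.common.util import START_SYMBOL, END_SYMBOL

vocab = Vocabulary()
for word in (START_SYMBOL, "the", "cat", "sat", END_SYMBOL):
    vocab.add_token_to_namespace(word)

ids = np.array([vocab.get_token_index(w)
                for w in (START_SYMBOL, "the", "cat", "sat", END_SYMBOL)])
print(detokenize(ids, vocab))  # padding/start/end tokens are stripped -> "the cat sat"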
Example #6
Source File: test_readers.py    From summarus with Apache License 2.0    5 votes
def test_cnn_dailymail_reader(self):
        tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
        reader = CNNDailyMailReader(tokenizer, cnn_tokenized_dir=TEST_STORIES_DIR, separate_namespaces=False)
        dataset = reader.read(TEST_URLS_FILE)
        for sample in dataset:
            self.assertEqual(sample.fields["source_tokens"][0].text, START_SYMBOL)
            self.assertEqual(sample.fields["source_tokens"][-1].text, END_SYMBOL)
            self.assertGreater(len(sample.fields["source_tokens"]), 2)

            self.assertEqual(sample.fields["target_tokens"][0].text, START_SYMBOL)
            self.assertEqual(sample.fields["target_tokens"][-1].text, END_SYMBOL)
            self.assertGreater(len(sample.fields["target_tokens"]), 2) 
Example #7
Source File: ir_labeled_tuple_loader.py    From transformer-kernel-ranking with Apache License 2.0    5 votes
def text_to_instance(self, query_id:str, doc_id:str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ

        query_id_field = MetadataField(query_id)
        doc_id_field = MetadataField(doc_id)

        query_tokenized = self._tokenizer.tokenize(query_sequence)
        #if self._source_add_start_token:
        #    query_tokenized.insert(0, Token(START_SYMBOL))
        #query_tokenized.append(Token(END_SYMBOL))
        if self.max_query_length > -1:
            query_tokenized = query_tokenized[:self.max_query_length]
        if self.min_query_length > -1 and len(query_tokenized) < self.min_query_length:
            query_tokenized = query_tokenized + [self.padding_value] * (self.min_query_length - len(query_tokenized))

        query_field = TextField(query_tokenized, self._token_indexers)
        
        doc_tokenized = self._tokenizer.tokenize(doc_sequence)
        #doc_tokenized.insert(0, Token(START_SYMBOL))
        #doc_tokenized.append(Token(END_SYMBOL))
        if self.max_doc_length > -1:
            doc_tokenized = doc_tokenized[:self.max_doc_length]
        if self.min_doc_length > -1 and len(doc_tokenized) < self.min_doc_length:
            doc_tokenized = doc_tokenized + [self.padding_value] * (self.min_doc_length - len(doc_tokenized))

        doc_field = TextField(doc_tokenized, self._token_indexers)

        return Instance({
            "query_id":query_id_field,
            "doc_id":doc_id_field,
            "query_tokens":query_field,
            "doc_tokens":doc_field}) 
Example #8
Source File: ir_triple_loader.py    From sigir19-neural-ir with Apache License 2.0    5 votes
def text_to_instance(self, query_sequence: str, doc_pos_sequence: str, doc_neg_sequence: str) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        query_tokenized = self._tokenizer.tokenize(query_sequence)
        #if self._source_add_start_token:
        #    query_tokenized.insert(0, Token(START_SYMBOL))
        #query_tokenized.append(Token(END_SYMBOL))
        if self.max_query_length > -1:
            query_tokenized = query_tokenized[:self.max_query_length]

        query_field = TextField(query_tokenized, self._token_indexers)
        
        doc_pos_tokenized = self._tokenizer.tokenize(doc_pos_sequence)
        #doc_pos_tokenized.insert(0, Token(START_SYMBOL))
        #doc_pos_tokenized.append(Token(END_SYMBOL))
        if self.max_doc_length > -1:
            doc_pos_tokenized = doc_pos_tokenized[:self.max_doc_length]

        doc_pos_field = TextField(doc_pos_tokenized, self._token_indexers)

        doc_neg_tokenized = self._tokenizer.tokenize(doc_neg_sequence)
        #doc_neg_tokenized.insert(0, Token(START_SYMBOL))
        #doc_neg_tokenized.append(Token(END_SYMBOL))
        if self.max_doc_length > -1:
            doc_neg_tokenized = doc_neg_tokenized[:self.max_doc_length]

        doc_neg_field = TextField(doc_neg_tokenized, self._token_indexers)

        query_length = LabelField(len(query_tokenized), skip_indexing=True)
        doc_pos_length = LabelField(len(doc_pos_tokenized), skip_indexing=True)
        doc_neg_length = LabelField(len(doc_neg_tokenized), skip_indexing=True)

        return Instance({
            "query_tokens":query_field,
            "doc_pos_tokens":doc_pos_field,
            "doc_neg_tokens": doc_neg_field,
            "query_length":query_length,
            "doc_pos_length":doc_pos_length,
            "doc_neg_length":doc_neg_length}) 
Example #9
Source File: ir_labeled_tuple_loader.py    From sigir19-neural-ir with Apache License 2.0    5 votes
def text_to_instance(self, query_id:str, doc_id:str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ

        query_id_field = LabelField(int(query_id), skip_indexing=True)
        doc_id_field = LabelField(int(doc_id), skip_indexing=True)

        query_tokenized = self._tokenizer.tokenize(query_sequence)
        #if self._source_add_start_token:
        #    query_tokenized.insert(0, Token(START_SYMBOL))
        #query_tokenized.append(Token(END_SYMBOL))
        if self.max_query_length > -1:
            query_tokenized = query_tokenized[:self.max_query_length]

        query_field = TextField(query_tokenized, self._token_indexers)
        
        doc_tokenized = self._tokenizer.tokenize(doc_sequence)
        #doc_tokenized.insert(0, Token(START_SYMBOL))
        #doc_tokenized.append(Token(END_SYMBOL))
        if self.max_doc_length > -1:
            doc_tokenized = doc_tokenized[:self.max_doc_length]

        doc_field = TextField(doc_tokenized, self._token_indexers)

        query_length = LabelField(len(query_tokenized), skip_indexing=True)
        doc_length = LabelField(len(doc_tokenized), skip_indexing=True)

        return Instance({
            "query_id":query_id_field,
            "doc_id":doc_id_field,
            "query_tokens":query_field,
            "doc_tokens":doc_field,
            "query_length":query_length,
            "doc_length":doc_length}) 
Example #10
Source File: domain_language.py    From allennlp-semparse with Apache License 2.0    5 votes
def get_nonterminal_productions(self) -> Dict[str, List[str]]:
        """
        Induces a grammar from the defined collection of predicates in this language and returns
        all productions in that grammar, keyed by the non-terminal they are expanding.

        This includes terminal productions implied by each predicate as well as productions for the
        `return type` of each defined predicate.  For example, defining a "multiply" predicate adds
        a "<int,int:int> -> multiply" terminal production to the grammar, and `also` a "int ->
        [<int,int:int>, int, int]" non-terminal production, because I can use the "multiply"
        predicate to produce an int.
        """
        if not self._nonterminal_productions:
            actions: Dict[str, Set[str]] = defaultdict(set)
            # If you didn't give us a set of valid start types, we'll assume all types we know
            # about (including functional types) are valid start types.
            if self._start_types:
                start_types = self._start_types
            else:
                start_types = set()
                for type_list in self._function_types.values():
                    start_types.update(type_list)
            for start_type in start_types:
                actions[START_SYMBOL].add(f"{START_SYMBOL} -> {start_type}")
            for name, function_type_list in self._function_types.items():
                for function_type in function_type_list:
                    actions[str(function_type)].add(f"{function_type} -> {name}")
                    if isinstance(function_type, FunctionType):
                        return_type = function_type.return_type
                        arg_types = function_type.argument_types
                        right_side = f"[{function_type}, {', '.join(str(arg_type) for arg_type in arg_types)}]"
                        actions[str(return_type)].add(f"{return_type} -> {right_side}")
            self._nonterminal_productions = {key: sorted(value) for key, value in actions.items()}
        return self._nonterminal_productions 
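To make the docstring above concrete, here is a small sketch of a toy language; the import path and constructor arguments follow allennlp-semparse's DomainLanguage as I understand it, so treat them as assumptions rather than a verified recipe:

from allennlp_semparse.domain_languages.domain_language import DomainLanguage, predicate

class Arithmetic(DomainLanguage):
    def __init__(self):
        # int is the only valid start type in this toy grammar
        super().__init__(start_types={int}, allowed_constants={"1": 1, "2": 2})

    @predicate
    def multiply(self, num1: int, num2: int) -> int:
        return num1 * num2

language = Arithmetic()
productions = language.get_nonterminal_productions()
# Productions are keyed by non-terminal; START_SYMBOL ("@start@") expands to the start types.
print(productions["@start@"])         # e.g. ['@start@ -> int']
print(productions["<int,int:int>"])   # e.g. ['<int,int:int> -> multiply']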
Example #11
Source File: type_declaration.py    From allennlp-semparse with Apache License 2.0    5 votes
def __str__(self):
        # TODO (pradeep): This limits the number of basic types we can have to 26. We may want to
        # change this in the future if we extend to domains where we have more than 26 basic types.
        if self._string_rep == START_SYMBOL:
            return START_SYMBOL
        else:
            return self._string_rep.lower()[0] 
Example #12
Source File: pytorch_misc.py    From r2c with MIT License    5 votes
def detokenize(array, vocab):
    """
    Given an array of ints, we'll turn this into a string or a list of strings.
    :param array: possibly multidimensional numpy array
    :return:
    """
    if array.ndim > 1:
        return [detokenize(x, vocab) for x in array]
    tokenized = [vocab.get_token_from_index(v) for v in array]
    return ' '.join([x for x in tokenized if x not in (vocab._padding_token, START_SYMBOL, END_SYMBOL)]) 
Example #13
Source File: simple_seq2seq.py    From magnitude with MIT License    5 votes
def __init__(self,
                 vocab,
                 source_embedder,
                 encoder,
                 max_decoding_steps,
                 target_namespace=u"tokens",
                 target_embedding_dim=None,
                 attention_function=None,
                 scheduled_sampling_ratio=0.0):
        super(SimpleSeq2Seq, self).__init__(vocab)
        self._source_embedder = source_embedder
        self._encoder = encoder
        self._max_decoding_steps = max_decoding_steps
        self._target_namespace = target_namespace
        self._attention_function = attention_function
        self._scheduled_sampling_ratio = scheduled_sampling_ratio
        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
        num_classes = self.vocab.get_vocab_size(self._target_namespace)
        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
        # we're using attention with ``DotProductSimilarity``, this is needed.
        self._decoder_output_dim = self._encoder.get_output_dim()
        target_embedding_dim = target_embedding_dim or self._source_embedder.get_output_dim()
        self._target_embedder = Embedding(num_classes, target_embedding_dim)
        if self._attention_function:
            self._decoder_attention = LegacyAttention(self._attention_function)
            # The output of attention, a weighted average over encoder outputs, will be
            # concatenated to the input vector of the decoder at each time step.
            self._decoder_input_dim = self._encoder.get_output_dim() + target_embedding_dim
        else:
            self._decoder_input_dim = target_embedding_dim
        # TODO (pradeep): Do not hardcode decoder cell type.
        self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)
        self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)

    #overrides 
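The comment about start and end symbols above boils down to a vocabulary lookup; a minimal sketch with a standard AllenNLP Vocabulary (assumed setup, not part of simple_seq2seq.py):

from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data import Vocabulary

vocab = Vocabulary()
start_index = vocab.add_token_to_namespace(START_SYMBOL, namespace="tokens")
end_index = vocab.add_token_to_namespace(END_SYMBOL, namespace="tokens")

# The decoder feeds start_index at the first timestep and stops once it emits end_index.
assert vocab.get_token_index(START_SYMBOL, "tokens") == start_index
assert vocab.get_token_index(END_SYMBOL, "tokens") == end_index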
Example #14
Source File: type_declaration.py    From magnitude with MIT License    5 votes
def __str__(self):
        # TODO (pradeep): This limits the number of basic types we can have to 26. We may want to
        # change this in the future if we extend to domains where we have more than 26 basic types.
        if self._string_rep == START_SYMBOL:
            return START_SYMBOL
        else:
            return self._string_rep.lower()[0] 
Example #15
Source File: seq2seq.py    From magnitude with MIT License    5 votes
def text_to_instance(self, source_string, target_string=None):  # type: ignore
        # pylint: disable=arguments-differ
        tokenized_source = self._source_tokenizer.tokenize(source_string)
        if self._source_add_start_token:
            tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._source_token_indexers)
        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target, self._target_token_indexers)
            return Instance({u"source_tokens": source_field, u"target_tokens": target_field})
        else:
            return Instance({u'source_tokens': source_field}) 
Example #16
Source File: custom_autoregressive_seq2seq_decoder.py    From summarus with Apache License 2.0    4 votes
def __init__(
            self,
            vocab: Vocabulary,
            decoder_net: DecoderNet,
            max_decoding_steps: int,
            target_embedder: Embedding,
            target_namespace: str = "tokens",
            tie_output_embedding: bool = False,
            scheduled_sampling_ratio: float = 0,
            label_smoothing_ratio: Optional[float] = None,
            beam_size: int = 4,
            tensor_based_metric: Metric = None,
            token_based_metric: Metric = None,
    ) -> None:
        super().__init__(target_embedder)

        self._vocab = vocab

        self._decoder_net = decoder_net
        self._max_decoding_steps = max_decoding_steps
        self._target_namespace = target_namespace
        self._label_smoothing_ratio = label_smoothing_ratio

        self._start_index = self._vocab.get_token_index(START_SYMBOL, self._target_namespace)
        self._end_index = self._vocab.get_token_index(END_SYMBOL, self._target_namespace)
        self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size)

        target_vocab_size = self._vocab.get_vocab_size(self._target_namespace)

        if self.target_embedder.get_output_dim() != self._decoder_net.target_embedding_dim:
            raise ConfigurationError("Target Embedder output_dim doesn't match decoder module's input.")

        self._output_projection_layer = Linear(self._decoder_net.get_output_dim(), target_vocab_size)

        if tie_output_embedding:
            if self._output_projection_layer.weight.shape != self.target_embedder.weight.shape:
                raise ConfigurationError("Can't tie embeddings with output linear layer, due to shape mismatch")
            self._output_projection_layer.weight = self.target_embedder.weight

        self._tensor_based_metric = tensor_based_metric
        self._token_based_metric = token_based_metric
        self._scheduled_sampling_ratio = scheduled_sampling_ratio 
Example #17
Source File: summarization_reader.py    From summarus with Apache License 2.0    4 votes
def text_to_instance(self, source: str, target: str = None) -> Instance:
        def prepare_text(text, max_tokens):
            text = text.lower() if self._lowercase else text
            tokens = self._tokenizer.tokenize(text)[:max_tokens]
            tokens.insert(0, Token(START_SYMBOL))
            tokens.append(Token(END_SYMBOL))
            return tokens

        source_tokens = prepare_text(source, self._source_max_tokens)
        source_tokens_indexed = TextField(source_tokens, self._source_token_indexers)
        result = {'source_tokens': source_tokens_indexed}
        meta_fields = {}

        if self._save_copy_fields:
            source_to_target_field = NamespaceSwappingField(source_tokens[1:-1], self._target_namespace)
            result["source_to_target"] = source_to_target_field
            meta_fields["source_tokens"] = [x.text for x in source_tokens[1:-1]]

        if self._save_pgn_fields:
            source_to_target_field = NamespaceSwappingField(source_tokens, self._target_namespace)
            result["source_to_target"] = source_to_target_field
            meta_fields["source_tokens"] = [x.text for x in source_tokens]

        if target:
            target_tokens = prepare_text(target, self._target_max_tokens)
            target_tokens_indexed = TextField(target_tokens, self._target_token_indexers)
            result['target_tokens'] = target_tokens_indexed

            if self._save_pgn_fields:
                meta_fields["target_tokens"] = [y.text for y in target_tokens]
                source_and_target_token_ids = self._tokens_to_ids(source_tokens + target_tokens, self._lowercase)
                source_token_ids = source_and_target_token_ids[:len(source_tokens)]
                result["source_token_ids"] = ArrayField(np.array(source_token_ids, dtype='long'))
                target_token_ids = source_and_target_token_ids[len(source_tokens):]
                result["target_token_ids"] = ArrayField(np.array(target_token_ids, dtype='long'))

            if self._save_copy_fields:
                meta_fields["target_tokens"] = [y.text for y in target_tokens[1:-1]]
                source_and_target_token_ids = self._tokens_to_ids(source_tokens[1:-1] + target_tokens, self._lowercase)
                source_token_ids = source_and_target_token_ids[:len(source_tokens)-2]
                result["source_token_ids"] = ArrayField(np.array(source_token_ids))
                target_token_ids = source_and_target_token_ids[len(source_tokens)-2:]
                result["target_token_ids"] = ArrayField(np.array(target_token_ids))

        elif self._save_copy_fields:
            source_token_ids = self._tokens_to_ids(source_tokens[1:-1], self._lowercase)
            result["source_token_ids"] = ArrayField(np.array(source_token_ids))
        elif self._save_pgn_fields:
            source_token_ids = self._tokens_to_ids(source_tokens, self._lowercase)
            result["source_token_ids"] = ArrayField(np.array(source_token_ids))
        if self._save_copy_fields or self._save_pgn_fields:
            result["metadata"] = MetadataField(meta_fields)
        return Instance(result) 
Example #18
Source File: action_space_walker.py    From magnitude with MIT License    4 votes
def _walk(self):
        u"""
        Walk over action space to collect completed paths of at most ``self._max_path_length`` steps.
        """
        # Buffer of NTs to expand, previous actions
        incomplete_paths = [([unicode(type_)], ["{START_SYMBOL} -> {type_}".format(START_SYMBOL=START_SYMBOL, type_=type_)]) for type_ in
                            self._world.get_valid_starting_types()]

        self._completed_paths = []
        actions = self._world.get_valid_actions()
        # Overview: We keep track of the buffer of non-terminals to expand, and the action history
        # for each incomplete path. At every iteration in the while loop below, we iterate over all
        # incomplete paths, expand one non-terminal from the buffer in a depth-first fashion, get
        # all possible next actions triggered by that non-terminal and add to the paths. Then, we
        # check the expanded paths, to see if they are 1) complete, in which case they are
        # added to completed_paths, 2) longer than max_path_length, in which case they are
        # discarded, or 3) neither, in which case they are used to form the incomplete_paths for the
        # next iteration of this while loop.
        # While the non-terminal expansion is done in a depth-first fashion, note that the search over
        # the action space itself is breadth-first.
        while incomplete_paths:
            next_paths = []
            for nonterminal_buffer, history in incomplete_paths:
                # Taking the last non-terminal added to the buffer. We're going depth-first.
                nonterminal = nonterminal_buffer.pop()
                # Iterating over all possible next actions.
                for action in actions[nonterminal]:
                    new_history = history + [action]
                    new_nonterminal_buffer = nonterminal_buffer[:]
                    # Since we expand the last action added to the buffer, the left child should be
                    # added after the right child.
                    for right_side_part in reversed(self._get_right_side_parts(action)):
                        if types.is_nonterminal(right_side_part):
                            new_nonterminal_buffer.append(right_side_part)
                    next_paths.append((new_nonterminal_buffer, new_history))
            incomplete_paths = []
            for nonterminal_buffer, path in next_paths:
                # An empty buffer means that we've completed this path.
                if not nonterminal_buffer:
                    # Indexing completed paths by the nonterminals they contain.
                    next_path_index = len(self._completed_paths)
                    for action in path:
                        for value in self._get_right_side_parts(action):
                            if not types.is_nonterminal(value):
                                self._terminal_path_index[action].add(next_path_index)
                    self._completed_paths.append(path)
                # We're adding to incomplete_paths for the next iteration, only those paths that are
                # shorter than the max_path_length. The remaining paths will be discarded.
                elif len(path) <= self._max_path_length:
                    incomplete_paths.append((nonterminal_buffer, path))