Python rasa_nlu.training_data.TrainingData() Examples

The following are 30 code examples of rasa_nlu.training_data.TrainingData(), collected from open-source projects. Each example notes its source file and originating project. You may also want to check out all available functions/classes of the module rasa_nlu.training_data.
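
For a quick orientation before the examples, here is a minimal sketch of constructing a TrainingData object directly. It is a hedged illustration: the utterances and intent labels are invented for this sketch, and Message and TrainingData are assumed to be importable from rasa_nlu.training_data, as the examples below do.

from rasa_nlu.training_data import Message, TrainingData

# two labeled utterances; TrainingData derives intent_examples from them
examples = [
    Message("hello there", {"intent": "greet"}),
    Message("bye for now", {"intent": "goodbye"}),
]
training_data = TrainingData(training_examples=examples)
print(len(training_data.intent_examples))  # 2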
Example #1
Source File: loading.py    From rasa_nlu with Apache License 2.0
def load_data_from_endpoint(data_endpoint: EndpointConfig,
                            language: Optional[Text] = 'en') -> 'TrainingData':
    """Load training data from a URL."""

    if not utils.is_url(data_endpoint.url):
        raise requests.exceptions.InvalidURL(data_endpoint.url)
    try:
        response = data_endpoint.request("get")
        response.raise_for_status()
        temp_data_file = utils.create_temporary_file(response.content,
                                                     mode="w+b")
        training_data = _load(temp_data_file, language)

        return training_data
    except Exception as e:
        logger.warning("Could not retrieve training data "
                       "from URL:\n{}".format(e)) 
Example #2
Source File: test_extractors.py    From Rasa_NLU_Chi with Apache License 2.0
def test_unintentional_synonyms_capitalized(component_builder):
    _config = utilities.base_test_conf("spacy_sklearn")
    ner_syn = component_builder.create_component("ner_synonyms", _config)
    examples = [
        Message("Any Mexican restaurant will do", {
            "intent": "restaurant_search",
            "entities": [{"start": 4,
                          "end": 11,
                          "value": "Mexican",
                          "entity": "cuisine"}]
        }),
        Message("I want Tacos!", {
            "intent": "restaurant_search",
            "entities": [{"start": 7,
                          "end": 12,
                          "value": "Mexican",
                          "entity": "cuisine"}]
        })
    ]
    ner_syn.train(TrainingData(training_examples=examples), _config)
    assert ner_syn.synonyms.get("mexican") is None
    assert ner_syn.synonyms.get("tacos") == "Mexican" 
Example #3
Source File: test_extractors.py    From rasa_nlu with Apache License 2.0
def test_unintentional_synonyms_capitalized(component_builder):

    _config = utilities.base_test_conf("pretrained_embeddings_spacy")
    ner_syn = component_builder.create_component(_config.for_component(5),
                                                 _config)
    examples = [
        Message("Any Mexican restaurant will do", {
            "intent": "restaurant_search",
            "entities": [{"start": 4,
                          "end": 11,
                          "value": "Mexican",
                          "entity": "cuisine"}]
        }),
        Message("I want Tacos!", {
            "intent": "restaurant_search",
            "entities": [{"start": 7,
                          "end": 12,
                          "value": "Mexican",
                          "entity": "cuisine"}]
        })
    ]
    ner_syn.train(TrainingData(training_examples=examples), _config)
    assert ner_syn.synonyms.get("mexican") is None
    assert ner_syn.synonyms.get("tacos") == "Mexican" 
Example #4
Source File: loading.py    From rasa_nlu with Apache License 2.0
def load_data(resource_name: Text,
              language: Optional[Text] = 'en') -> 'TrainingData':
    """Load training data from disk.

    If multiple files are found, they are merged into a single TrainingData object."""
    from rasa_nlu.training_data import TrainingData

    files = utils.list_files(resource_name)
    data_sets = [_load(f, language) for f in files]
    data_sets = [ds for ds in data_sets if ds]
    if len(data_sets) == 0:
        training_data = TrainingData()
    elif len(data_sets) == 1:
        training_data = data_sets[0]
    else:
        training_data = data_sets[0].merge(*data_sets[1:])

    return training_data 
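
A hedged usage sketch for load_data follows; the file path is illustrative only, and load_data is assumed to be re-exported from rasa_nlu.training_data. Any format rasa_nlu supports should work, and passing a directory merges all files found in it.

from rasa_nlu.training_data import load_data

training_data = load_data("data/examples/rasa/demo-rasa.json")  # illustrative path
print(len(training_data.training_examples))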
Example #5
Source File: test_featurizers.py    From rasa_nlu with Apache License 2.0
def test_count_vector_featurizer(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"min_ngram": 1,
                                  "max_ngram": 2,
                                  "analyzer": 'char'})
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected) 
Example #6
Source File: dialogflow.py    From Rasa_NLU_Chi with Apache License 2.0
def read(self, fn, **kwargs):
        # type: (Text, **Any) -> TrainingData
        """Loads training data stored in the Dialogflow data format."""

        language = kwargs["language"]
        fformat = kwargs["fformat"]

        if fformat not in {DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES}:
            raise ValueError("fformat must be either {} or {}".format(DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES))

        root_js = utils.read_json_file(fn)
        examples_js = self._read_examples_js(fn, language, fformat)

        if not examples_js:
            logger.warning("No training examples found for dialogflow file {}!".format(fn))
            return TrainingData()
        elif fformat == DIALOGFLOW_INTENT:
            return self._read_intent(root_js, examples_js)
        elif fformat == DIALOGFLOW_ENTITIES:
            return self._read_entities(examples_js) 
Example #7
Source File: test_featurizers.py    From rasa_nlu with Apache License 2.0
def test_count_vector_featurizer_oov_token(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b',
                                  "OOV_token": '__oov__'})
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected) 
Example #8
Source File: markdown.py    From rasa_nlu with Apache License 2.0
def reads(self, s: Text, **kwargs: Any) -> 'TrainingData':
        """Read markdown string and create TrainingData object"""
        from rasa_nlu.training_data import TrainingData

        self.__init__()
        s = self._strip_comments(s)
        for line in s.splitlines():
            line = line.strip()
            header = self._find_section_header(line)
            if header:
                self._set_current_section(header[0], header[1])
            else:
                self._parse_item(line)
                self._load_files(line)
        return TrainingData(self.training_examples, self.entity_synonyms,
                            self.regex_features, self.lookup_tables) 
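
To make the expected markdown layout concrete, here is a hedged sketch; it assumes this reads method belongs to MarkdownReader and that the class is exposed via rasa_nlu.training_data.formats. The intent name and utterances are illustrative.

from rasa_nlu.training_data.formats import MarkdownReader

md = ("## intent:greet\n"
      "- hey\n"
      "- hello there\n")

training_data = MarkdownReader().reads(md)
print(len(training_data.training_examples))  # 2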
Example #9
Source File: crf_entity_extractor.py    From rasa_nlu with Apache License 2.0
def train(self,
              training_data: TrainingData,
              config: RasaNLUModelConfig,
              **kwargs: Any) -> None:

        # checks whether there is at least one
        # example with an entity annotation
        if training_data.entity_examples:
            self._check_spacy_doc(training_data.training_examples[0])

            # filter out pre-trained entity examples
            filtered_entity_examples = self.filter_trainable_entities(
                training_data.training_examples)

            # convert the dataset into features
            # this will train on ALL examples, even the ones
            # without annotations
            dataset = self._create_dataset(filtered_entity_examples)

            self._train_model(dataset) 
Example #10
Source File: sklearn_intent_classifier.py    From rasa_nlu with Apache License 2.0
def train(self,
              training_data: TrainingData,
              cfg: RasaNLUModelConfig,
              **kwargs: Any) -> None:
        """Train the intent classifier on a data set."""

        num_threads = kwargs.get("num_threads", 1)

        labels = [e.get("intent")
                  for e in training_data.intent_examples]

        if len(set(labels)) < 2:
            logger.warning("Can not train an intent classifier. "
                           "Need at least 2 different classes. "
                           "Skipping training of intent classifier.")
        else:
            y = self.transform_labels_str2num(labels)
            X = np.stack([example.get("text_features")
                          for example in training_data.intent_examples])

            self.clf = self._create_classifier(num_threads, y)

            self.clf.fit(X, y) 
Example #11
Source File: mitie_intent_classifier.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, cfg, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None
        import mitie

        model_file = kwargs.get("mitie_file")
        if not model_file:
            raise Exception("Can not run MITIE entity extractor without a "
                            "language model. Make sure this component is "
                            "preceeded by the 'nlp_mitie' component.")

        trainer = mitie.text_categorizer_trainer(model_file)
        trainer.num_threads = kwargs.get("num_threads", 1)

        for example in training_data.intent_examples:
            tokens = self._tokens_of_message(example)
            trainer.add_labeled_text(tokens, example.get("intent"))

        if training_data.intent_examples:
            # we can not call train if there are no examples!
            self.clf = trainer.train() 
Example #12
Source File: embedding_intent_classifier.py    From rasa_nlu with Apache License 2.0
def _prepare_data_for_training(
            self,
            training_data: 'TrainingData',
            intent_dict: Dict[Text, int]
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Prepare data for training"""

        X = np.stack([e.get("text_features")
                      for e in training_data.intent_examples])

        intents_for_X = np.array([intent_dict[e.get("intent")]
                                  for e in training_data.intent_examples])

        Y = np.stack([self.encoded_all_intents[intent_idx]
                      for intent_idx in intents_for_X])

        return X, Y, intents_for_X

    # tf helpers: 
Example #13
Source File: sklearn_intent_classifier.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, cfg, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None
        """Train the intent classifier on a data set."""

        num_threads = kwargs.get("num_threads", 1)

        labels = [e.get("intent")
                  for e in training_data.intent_examples]

        if len(set(labels)) < 2:
            logger.warn("Can not train an intent classifier. "
                        "Need at least 2 different classes. "
                        "Skipping training of intent classifier.")
        else:
            y = self.transform_labels_str2num(labels)
            X = np.stack([example.get("text_features")
                          for example in training_data.intent_examples])

            self.clf = self._create_classifier(num_threads, y)

            self.clf.fit(X, y) 
Example #14
Source File: model.py    From Rasa_NLU_Chi with Apache License 2.0
def __init__(self,
                 cfg,  # type: RasaNLUModelConfig
                 component_builder=None,  # type: Optional[ComponentBuilder]
                 skip_validation=False  # type: bool
                 ):
        # type: (...) -> None

        self.config = cfg
        self.skip_validation = skip_validation
        self.training_data = None  # type: Optional[TrainingData]

        if component_builder is None:
            # If no builder is passed, every interpreter creation will result
            # in a new builder; hence, no components are reused.
            component_builder = components.ComponentBuilder()

        # Before instantiating the component classes, let's check that all
        # required packages are available
        if not self.skip_validation:
            components.validate_requirements(cfg.component_names)

        # build pipeline
        self.pipeline = self._build_pipeline(cfg, component_builder) 
Example #15
Source File: dialogflow.py    From rasa_nlu with Apache License 2.0
def read(self, fn: Text, **kwargs: Any) -> 'TrainingData':
        """Loads training data stored in the Dialogflow data format."""
        from rasa_nlu.training_data import TrainingData

        language = kwargs["language"]
        fformat = kwargs["fformat"]

        if fformat not in {DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES}:
            raise ValueError("fformat must be either {}, or {}"
                             "".format(DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES))

        root_js = utils.read_json_file(fn)
        examples_js = self._read_examples_js(fn, language, fformat)

        if not examples_js:
            logger.warning("No training examples found for dialogflow file {}!"
                           "".format(fn))
            return TrainingData()
        elif fformat == DIALOGFLOW_INTENT:
            return self._read_intent(root_js, examples_js)
        elif fformat == DIALOGFLOW_ENTITIES:
            return self._read_entities(root_js, examples_js) 
Example #16
Source File: spacy_featurizer.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

        for example in training_data.intent_examples:
            self._set_spacy_features(example) 
Example #17
Source File: mitie_entity_extractor.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig) -> None
        import mitie

        model_file = kwargs.get("mitie_file")
        if not model_file:
            raise Exception("Can not run MITIE entity extractor without a "
                            "language model. Make sure this component is "
                            "preceeded by the 'nlp_mitie' component.")

        trainer = mitie.ner_trainer(model_file)
        trainer.num_threads = kwargs.get("num_threads", 1)
        found_one_entity = False

        # filter out pre-trained entity examples
        filtered_entity_examples = self.filter_trainable_entities(
                training_data.training_examples)

        for example in filtered_entity_examples:
            sample = self._prepare_mitie_sample(example)

            found_one_entity = sample.num_entities > 0 or found_one_entity
            trainer.add(sample)

        # Mitie will fail to train if there is not a single entity tagged
        if found_one_entity:
            self.ner = trainer.train() 
Example #18
Source File: count_vectors_featurizer.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, cfg=None, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None
        """Take parameters from config and
            construct a new count vectorizer using the sklearn framework."""
        from sklearn.feature_extraction.text import CountVectorizer

        # treat even a single-character word as a token
        self.vect = CountVectorizer(token_pattern=self.token_pattern,
                                    strip_accents=self.strip_accents,
                                    stop_words=self.stop_words,
                                    ngram_range=(self.min_ngram,
                                                 self.max_ngram),
                                    max_df=self.max_df,
                                    min_df=self.min_df,
                                    max_features=self.max_features,
                                    preprocessor=self.preprocessor)

        lem_exs = [self._lemmatize(example)
                   for example in training_data.intent_examples]

        try:
            X = self.vect.fit_transform(lem_exs).toarray()
        except ValueError:
            self.vect = None
            return

        for i, example in enumerate(training_data.intent_examples):
            # create bag for each example
            example.set("text_features", X[i]) 
Example #19
Source File: ngram_featurizer.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, cfg, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

        start = time.time()
        self.train_on_sentences(training_data.intent_examples)
        logger.debug("Ngram collection took {} seconds"
                     "".format(time.time() - start))

        for example in training_data.training_examples:
            updated = self._text_features_with_ngrams(example,
                                                      self.best_num_ngrams)
            example.set("text_features", updated) 
Example #20
Source File: regex_featurizer.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

        for example in training_data.regex_features:
            self.known_patterns.append(example)

        for example in training_data.training_examples:
            updated = self._text_features_with_regex(example)
            example.set("text_features", updated) 
Example #21
Source File: test_featurizers.py    From Rasa_NLU_Chi with Apache License 2.0
def test_count_vector_featurizer(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})
    message = Message(sentence)
    message.set("intent", "bla")
    data = TrainingData([message])

    ftr.train(data)
    ftr.process(message)

    assert np.all(message.get("text_features")[0] == expected) 
Example #22
Source File: test_train.py    From Rasa_NLU_Chi with Apache License 2.0
def test_train_with_empty_data(language, pipeline, component_builder, tmpdir):
    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})
    trainer = Trainer(_config, component_builder)
    trainer.train(TrainingData())
    persistor = create_persistor(_config)
    persisted_path = trainer.persist(tmpdir.strpath, persistor,
                                     project_name="my_project")
    loaded = Interpreter.load(persisted_path, component_builder)
    assert loaded.pipeline
    assert loaded.parse("hello") is not None
    assert loaded.parse("Hello today is Monday, again!") is not None 
Example #23
Source File: jieba_tokenizer.py    From rasa_bot with Apache License 2.0
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None
        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.text)) 
Example #24
Source File: model.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, data, **kwargs):
        # type: (TrainingData, **Any) -> Interpreter
        """Trains the underlying pipeline using the provided training data."""

        self.training_data = data

        context = kwargs  # type: Dict[Text, Any]

        for component in self.pipeline:
            updates = component.provide_context()
            if updates:
                context.update(updates)

        # Before the training starts: check that all arguments are provided
        if not self.skip_validation:
            components.validate_arguments(self.pipeline, context)

        # data gets modified internally during the training - hence the copy
        working_data = copy.deepcopy(data)

        for i, component in enumerate(self.pipeline):
            logger.info("Starting to train component {}"
                        "".format(component.name))
            component.prepare_partial_processing(self.pipeline[:i], context)
            updates = component.train(working_data, self.config,
                                      **context)
            logger.info("Finished training component.")
            if updates:
                context.update(updates)

        return Interpreter(self.pipeline, context) 
Example #25
Source File: whitespace_tokenizer.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.text)) 
Example #26
Source File: spacy_tokenizer.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.get("spacy_doc"))) 
Example #27
Source File: yaha_tokenizer.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUConfig, **Any) -> None
        if config['language'] != 'zh':
            raise Exception("tokenizer_yaha is only used for Chinese. Check your configure json file.")
            
        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.text)) 
Example #28
Source File: jieba_tokenizer.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None
            
        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.text)) 
Example #29
Source File: mitie_tokenizer.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.text)) 
Example #30
Source File: test_featurizers.py    From rasa_nlu with Apache License 2.0
def test_count_vector_featurizer_using_tokens(tokens, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})

    # use an empty string instead of real text to make sure the count
    # vectors can only come from the `tokens` feature;
    # using `message.text` would not give the correct result

    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set("tokens", tokens_feature)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set("tokens", tokens_feature)

    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)