Python rasa_nlu.training_data.TrainingData() Examples

The following are 30 code examples of rasa_nlu.training_data.TrainingData(), collected from open-source projects. Each example notes its source file and originating project. You may also want to check out all available functions/classes of the module rasa_nlu.training_data.
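
For a quick orientation before the examples, here is a minimal sketch of constructing a TrainingData object directly. It is a hedged illustration: the utterances and intent labels are invented for this sketch, and Message and TrainingData are assumed to be importable from rasa_nlu.training_data, as the examples below do.

from rasa_nlu.training_data import Message, TrainingData

# two labeled utterances; TrainingData derives intent_examples from them
examples = [
    Message("hello there", {"intent": "greet"}),
    Message("bye for now", {"intent": "goodbye"}),
]
training_data = TrainingData(training_examples=examples)
print(len(training_data.intent_examples))  # 2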
Example #1
Source File: loading.py    From rasa_nlu with Apache License 2.0
def load_data_from_endpoint(data_endpoint: EndpointConfig,
                            language: Optional[Text] = 'en') -> 'TrainingData':
    """Load training data from a URL."""

    if not utils.is_url(data_endpoint.url):
        raise requests.exceptions.InvalidURL(data_endpoint.url)
    try:
        response = data_endpoint.request("get")
        response.raise_for_status()
        temp_data_file = utils.create_temporary_file(response.content,
                                                     mode="w+b")
        training_data = _load(temp_data_file, language)

        return training_data
    except Exception as e:
        logger.warning("Could not retrieve training data "
                       "from URL:\n{}".format(e)) 
Example #2
Source File: test_extractors.py    From Rasa_NLU_Chi with Apache License 2.0
def test_unintentional_synonyms_capitalized(component_builder):
    _config = utilities.base_test_conf("spacy_sklearn")
    ner_syn = component_builder.create_component("ner_synonyms", _config)
    examples = [
        Message("Any Mexican restaurant will do", {
            "intent": "restaurant_search",
            "entities": [{"start": 4,
                          "end": 11,
                          "value": "Mexican",
                          "entity": "cuisine"}]
        }),
        Message("I want Tacos!", {
            "intent": "restaurant_search",
            "entities": [{"start": 7,
                          "end": 12,
                          "value": "Mexican",
                          "entity": "cuisine"}]
        })
    ]
    ner_syn.train(TrainingData(training_examples=examples), _config)
    assert ner_syn.synonyms.get("mexican") is None
    assert ner_syn.synonyms.get("tacos") == "Mexican" 
Example #3
Source File: test_extractors.py    From rasa_nlu with Apache License 2.0
def test_unintentional_synonyms_capitalized(component_builder):

    _config = utilities.base_test_conf("pretrained_embeddings_spacy")
    ner_syn = component_builder.create_component(_config.for_component(5),
                                                 _config)
    examples = [
        Message("Any Mexican restaurant will do", {
            "intent": "restaurant_search",
            "entities": [{"start": 4,
                          "end": 11,
                          "value": "Mexican",
                          "entity": "cuisine"}]
        }),
        Message("I want Tacos!", {
            "intent": "restaurant_search",
            "entities": [{"start": 7,
                          "end": 12,
                          "value": "Mexican",
                          "entity": "cuisine"}]
        })
    ]
    ner_syn.train(TrainingData(training_examples=examples), _config)
    assert ner_syn.synonyms.get("mexican") is None
    assert ner_syn.synonyms.get("tacos") == "Mexican" 
Example #4
Source File: loading.py    From rasa_nlu with Apache License 2.0
def load_data(resource_name: Text,
              language: Optional[Text] = 'en') -> 'TrainingData':
    """Load training data from disk.

    If multiple files are found, they are merged into a single TrainingData object."""
    from rasa_nlu.training_data import TrainingData

    files = utils.list_files(resource_name)
    data_sets = [_load(f, language) for f in files]
    data_sets = [ds for ds in data_sets if ds]
    if len(data_sets) == 0:
        training_data = TrainingData()
    elif len(data_sets) == 1:
        training_data = data_sets[0]
    else:
        training_data = data_sets[0].merge(*data_sets[1:])

    return training_data 
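
A hedged usage sketch for load_data follows; the file path is illustrative only, and load_data is assumed to be re-exported from rasa_nlu.training_data. Any format rasa_nlu supports should work, and passing a directory merges all files found in it.

from rasa_nlu.training_data import load_data

training_data = load_data("data/examples/rasa/demo-rasa.json")  # illustrative path
print(len(training_data.training_examples))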
Example #5
Source File: test_featurizers.py    From rasa_nlu with Apache License 2.0
def test_count_vector_featurizer(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"min_ngram": 1,
                                  "max_ngram": 2,
                                  "analyzer": 'char'})
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected) 
Example #6
Source File: dialogflow.py    From Rasa_NLU_Chi with Apache License 2.0
def read(self, fn, **kwargs):
        # type: (Text, **Any) -> TrainingData
        """Loads training data stored in the Dialogflow data format."""

        language = kwargs["language"]
        fformat = kwargs["fformat"]

        if fformat not in {DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES}:
            raise ValueError("fformat must be either {} or {}".format(DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES))

        root_js = utils.read_json_file(fn)
        examples_js = self._read_examples_js(fn, language, fformat)

        if not examples_js:
            logger.warning("No training examples found for dialogflow file {}!".format(fn))
            return TrainingData()
        elif fformat == DIALOGFLOW_INTENT:
            return self._read_intent(root_js, examples_js)
        elif fformat == DIALOGFLOW_ENTITIES:
            return self._read_entities(examples_js) 
Example #7
Source File: test_featurizers.py    From rasa_nlu with Apache License 2.0
def test_count_vector_featurizer_oov_token(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b',
                                  "OOV_token": '__oov__'})
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected) 
Example #8
Source File: markdown.py    From rasa_nlu with Apache License 2.0
def reads(self, s: Text, **kwargs: Any) -> 'TrainingData':
        """Read markdown string and create TrainingData object"""
        from rasa_nlu.training_data import TrainingData

        self.__init__()
        s = self._strip_comments(s)
        for line in s.splitlines():
            line = line.strip()
            header = self._find_section_header(line)
            if header:
                self._set_current_section(header[0], header[1])
            else:
                self._parse_item(line)
                self._load_files(line)
        return TrainingData(self.training_examples, self.entity_synonyms,
                            self.regex_features, self.lookup_tables) 
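
To make the expected markdown layout concrete, here is a hedged sketch; it assumes this reads method belongs to MarkdownReader and that the class is exposed via rasa_nlu.training_data.formats. The intent name and utterances are illustrative.

from rasa_nlu.training_data.formats import MarkdownReader

md = ("## intent:greet\n"
      "- hey\n"
      "- hello there\n")

training_data = MarkdownReader().reads(md)
print(len(training_data.training_examples))  # 2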
Example #9
Source File: crf_entity_extractor.py    From rasa_nlu with Apache License 2.0
def train(self,
              training_data: TrainingData,
              config: RasaNLUModelConfig,
              **kwargs: Any) -> None:

        # checks whether there is at least one
        # example with an entity annotation
        if training_data.entity_examples:
            self._check_spacy_doc(training_data.training_examples[0])

            # filter out pre-trained entity examples
            filtered_entity_examples = self.filter_trainable_entities(
                training_data.training_examples)

            # convert the dataset into features
            # this will train on ALL examples, even the ones
            # without annotations
            dataset = self._create_dataset(filtered_entity_examples)

            self._train_model(dataset) 
Example #10
Source File: sklearn_intent_classifier.py    From rasa_nlu with Apache License 2.0
def train(self,
              training_data: TrainingData,
              cfg: RasaNLUModelConfig,
              **kwargs: Any) -> None:
        """Train the intent classifier on a data set."""

        num_threads = kwargs.get("num_threads", 1)

        labels = [e.get("intent")
                  for e in training_data.intent_examples]

        if len(set(labels)) < 2:
            logger.warning("Can not train an intent classifier. "
                           "Need at least 2 different classes. "
                           "Skipping training of intent classifier.")
        else:
            y = self.transform_labels_str2num(labels)
            X = np.stack([example.get("text_features")
                          for example in training_data.intent_examples])

            self.clf = self._create_classifier(num_threads, y)

            self.clf.fit(X, y) 
Example #11
Source File: mitie_intent_classifier.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, cfg, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None
        import mitie

        model_file = kwargs.get("mitie_file")
        if not model_file:
            raise Exception("Can not run MITIE entity extractor without a "
                            "language model. Make sure this component is "
                            "preceeded by the 'nlp_mitie' component.")

        trainer = mitie.text_categorizer_trainer(model_file)
        trainer.num_threads = kwargs.get("num_threads", 1)

        for example in training_data.intent_examples:
            tokens = self._tokens_of_message(example)
            trainer.add_labeled_text(tokens, example.get("intent"))

        if training_data.intent_examples:
            # we can not call train if there are no examples!
            self.clf = trainer.train() 
Example #12
Source File: embedding_intent_classifier.py    From rasa_nlu with Apache License 2.0
def _prepare_data_for_training(
            self,
            training_data: 'TrainingData',
            intent_dict: Dict[Text, int]
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Prepare data for training"""

        X = np.stack([e.get("text_features")
                      for e in training_data.intent_examples])

        intents_for_X = np.array([intent_dict[e.get("intent")]
                                  for e in training_data.intent_examples])

        Y = np.stack([self.encoded_all_intents[intent_idx]
                      for intent_idx in intents_for_X])

        return X, Y, intents_for_X

    # tf helpers: 
Example #13
Source File: sklearn_intent_classifier.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, cfg, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None
        """Train the intent classifier on a data set."""

        num_threads = kwargs.get("num_threads", 1)

        labels = [e.get("intent")
                  for e in training_data.intent_examples]

        if len(set(labels)) < 2:
            logger.warn("Can not train an intent classifier. "
                        "Need at least 2 different classes. "
                        "Skipping training of intent classifier.")
        else:
            y = self.transform_labels_str2num(labels)
            X = np.stack([example.get("text_features")
                          for example in training_data.intent_examples])

            self.clf = self._create_classifier(num_threads, y)

            self.clf.fit(X, y) 
Example #14
Source File: model.py    From Rasa_NLU_Chi with Apache License 2.0
def __init__(self,
                 cfg,  # type: RasaNLUModelConfig
                 component_builder=None,  # type: Optional[ComponentBuilder]
                 skip_validation=False  # type: bool
                 ):
        # type: (...) -> None

        self.config = cfg
        self.skip_validation = skip_validation
        self.training_data = None  # type: Optional[TrainingData]

        if component_builder is None:
            # If no builder is passed, every interpreter creation will result
            # in a new builder; hence, no components are reused.
            component_builder = components.ComponentBuilder()

        # Before instantiating the component classes, let's check that all
        # required packages are available
        if not self.skip_validation:
            components.validate_requirements(cfg.component_names)

        # build pipeline
        self.pipeline = self._build_pipeline(cfg, component_builder) 
Example #15
Source File: dialogflow.py    From rasa_nlu with Apache License 2.0
def read(self, fn: Text, **kwargs: Any) -> 'TrainingData':
        """Loads training data stored in the Dialogflow data format."""
        from rasa_nlu.training_data import TrainingData

        language = kwargs["language"]
        fformat = kwargs["fformat"]

        if fformat not in {DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES}:
            raise ValueError("fformat must be either {}, or {}"
                             "".format(DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES))

        root_js = utils.read_json_file(fn)
        examples_js = self._read_examples_js(fn, language, fformat)

        if not examples_js:
            logger.warning("No training examples found for dialogflow file {}!"
                           "".format(fn))
            return TrainingData()
        elif fformat == DIALOGFLOW_INTENT:
            return self._read_intent(root_js, examples_js)
        elif fformat == DIALOGFLOW_ENTITIES:
            return self._read_entities(root_js, examples_js) 
Example #16
Source File: spacy_featurizer.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

        for example in training_data.intent_examples:
            self._set_spacy_features(example) 
Example #17
Source File: mitie_entity_extractor.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig) -> None
        import mitie

        model_file = kwargs.get("mitie_file")
        if not model_file:
            raise Exception("Can not run MITIE entity extractor without a "
                            "language model. Make sure this component is "
                            "preceeded by the 'nlp_mitie' component.")

        trainer = mitie.ner_trainer(model_file)
        trainer.num_threads = kwargs.get("num_threads", 1)
        found_one_entity = False

        # filter out pre-trained entity examples
        filtered_entity_examples = self.filter_trainable_entities(
                training_data.training_examples)

        for example in filtered_entity_examples:
            sample = self._prepare_mitie_sample(example)

            found_one_entity = sample.num_entities > 0 or found_one_entity
            trainer.add(sample)

        # Mitie will fail to train if there is not a single entity tagged
        if found_one_entity:
            self.ner = trainer.train() 
Example #18
Source File: count_vectors_featurizer.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, cfg=None, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None
        """Take parameters from config and
            construct a new count vectorizer using the sklearn framework."""
        from sklearn.feature_extraction.text import CountVectorizer

        # treat even a single-character word as a token
        self.vect = CountVectorizer(token_pattern=self.token_pattern,
                                    strip_accents=self.strip_accents,
                                    stop_words=self.stop_words,
                                    ngram_range=(self.min_ngram,
                                                 self.max_ngram),
                                    max_df=self.max_df,
                                    min_df=self.min_df,
                                    max_features=self.max_features,
                                    preprocessor=self.preprocessor)

        lem_exs = [self._lemmatize(example)
                   for example in training_data.intent_examples]

        try:
            X = self.vect.fit_transform(lem_exs).toarray()
        except ValueError:
            self.vect = None
            return

        for i, example in enumerate(training_data.intent_examples):
            # create bag for each example
            example.set("text_features", X[i]) 
Example #19
Source File: ngram_featurizer.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, cfg, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

        start = time.time()
        self.train_on_sentences(training_data.intent_examples)
        logger.debug("Ngram collection took {} seconds"
                     "".format(time.time() - start))

        for example in training_data.training_examples:
            updated = self._text_features_with_ngrams(example,
                                                      self.best_num_ngrams)
            example.set("text_features", updated) 
Example #20
Source File: regex_featurizer.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

        for example in training_data.regex_features:
            self.known_patterns.append(example)

        for example in training_data.training_examples:
            updated = self._text_features_with_regex(example)
            example.set("text_features", updated) 
Example #21
Source File: test_featurizers.py    From Rasa_NLU_Chi with Apache License 2.0
def test_count_vector_featurizer(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})
    message = Message(sentence)
    message.set("intent", "bla")
    data = TrainingData([message])

    ftr.train(data)
    ftr.process(message)

    assert np.all(message.get("text_features")[0] == expected) 
Example #22
Source File: test_train.py    From Rasa_NLU_Chi with Apache License 2.0
def test_train_with_empty_data(language, pipeline, component_builder, tmpdir):
    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})
    trainer = Trainer(_config, component_builder)
    trainer.train(TrainingData())
    persistor = create_persistor(_config)
    persisted_path = trainer.persist(tmpdir.strpath, persistor,
                                     project_name="my_project")
    loaded = Interpreter.load(persisted_path, component_builder)
    assert loaded.pipeline
    assert loaded.parse("hello") is not None
    assert loaded.parse("Hello today is Monday, again!") is not None 
Example #23
Source File: jieba_tokenizer.py    From rasa_bot with Apache License 2.0
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None
        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.text)) 
Example #24
Source File: model.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, data, **kwargs):
        # type: (TrainingData, **Any) -> Interpreter
        """Trains the underlying pipeline using the provided training data."""

        self.training_data = data

        context = kwargs  # type: Dict[Text, Any]

        for component in self.pipeline:
            updates = component.provide_context()
            if updates:
                context.update(updates)

        # Before the training starts: check that all arguments are provided
        if not self.skip_validation:
            components.validate_arguments(self.pipeline, context)

        # data gets modified internally during the training - hence the copy
        working_data = copy.deepcopy(data)

        for i, component in enumerate(self.pipeline):
            logger.info("Starting to train component {}"
                        "".format(component.name))
            component.prepare_partial_processing(self.pipeline[:i], context)
            updates = component.train(working_data, self.config,
                                      **context)
            logger.info("Finished training component.")
            if updates:
                context.update(updates)

        return Interpreter(self.pipeline, context) 
Example #25
Source File: whitespace_tokenizer.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.text)) 
Example #26
Source File: spacy_tokenizer.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.get("spacy_doc"))) 
Example #27
Source File: yaha_tokenizer.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUConfig, **Any) -> None
        if config['language'] != 'zh':
            raise Exception("tokenizer_yaha is only used for Chinese. Check your configure json file.")
            
        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.text)) 
Example #28
Source File: jieba_tokenizer.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None
            
        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.text)) 
Example #29
Source File: mitie_tokenizer.py    From Rasa_NLU_Chi with Apache License 2.0
def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.text)) 
Example #30
Source File: test_featurizers.py    From rasa_nlu with Apache License 2.0
def test_count_vector_featurizer_using_tokens(tokens, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})

    # use an empty string instead of real text to make sure the count
    # vectors can only come from the `tokens` feature;
    # using `message.text` would not give the correct result

    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set("tokens", tokens_feature)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set("tokens", tokens_feature)

    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)