Python rasa_nlu.training_data.TrainingData() Examples
The following are 30 code examples of rasa_nlu.training_data.TrainingData(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module rasa_nlu.training_data, or try the search function.
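
For orientation before the examples, here is a minimal sketch of constructing a TrainingData object by hand, using the same Message API the examples below rely on (the intent and entity annotations are hypothetical):

from rasa_nlu.training_data import Message, TrainingData

# each Message wraps one annotated training utterance
examples = [
    Message("Any Mexican restaurant will do", {
        "intent": "restaurant_search",
        "entities": [{"start": 4, "end": 11,
                      "value": "Mexican", "entity": "cuisine"}]
    }),
]

# bundle the messages so pipeline components can train on them
training_data = TrainingData(training_examples=examples)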
Example #1
Source File: loading.py From rasa_nlu with Apache License 2.0 | 6 votes |
def load_data_from_endpoint(data_endpoint: EndpointConfig,
                            language: Optional[Text] = 'en') -> 'TrainingData':
    """Load training data from a URL."""

    if not utils.is_url(data_endpoint.url):
        raise requests.exceptions.InvalidURL(data_endpoint.url)
    try:
        response = data_endpoint.request("get")
        response.raise_for_status()
        temp_data_file = utils.create_temporary_file(response.content,
                                                     mode="w+b")
        training_data = _load(temp_data_file, language)

        return training_data
    except Exception as e:
        logger.warning("Could not retrieve training data "
                       "from URL:\n{}".format(e))
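A quick usage sketch for the function above. The EndpointConfig import path is an assumption (it lives in rasa_nlu.utils.endpoints in some versions; adjust for yours), and the URL is a placeholder:

from rasa_nlu.utils.endpoints import EndpointConfig  # import path may vary by version

# hypothetical endpoint serving an NLU training data file
endpoint = EndpointConfig("https://example.com/nlu-training-data.json")
training_data = load_data_from_endpoint(endpoint, language="en")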
Example #2
Source File: test_extractors.py From Rasa_NLU_Chi with Apache License 2.0 | 6 votes |
def test_unintentional_synonyms_capitalized(component_builder):
    _config = utilities.base_test_conf("spacy_sklearn")
    ner_syn = component_builder.create_component("ner_synonyms", _config)
    examples = [
        Message("Any Mexican restaurant will do", {
            "intent": "restaurant_search",
            "entities": [{"start": 4,
                          "end": 11,
                          "value": "Mexican",
                          "entity": "cuisine"}]
        }),
        Message("I want Tacos!", {
            "intent": "restaurant_search",
            "entities": [{"start": 7,
                          "end": 12,
                          "value": "Mexican",
                          "entity": "cuisine"}]
        })
    ]
    ner_syn.train(TrainingData(training_examples=examples), _config)
    assert ner_syn.synonyms.get("mexican") is None
    assert ner_syn.synonyms.get("tacos") == "Mexican"
Example #3
Source File: test_extractors.py From rasa_nlu with Apache License 2.0 | 6 votes |
def test_unintentional_synonyms_capitalized(component_builder):
    _config = utilities.base_test_conf("pretrained_embeddings_spacy")
    ner_syn = component_builder.create_component(_config.for_component(5),
                                                 _config)
    examples = [
        Message("Any Mexican restaurant will do", {
            "intent": "restaurant_search",
            "entities": [{"start": 4,
                          "end": 11,
                          "value": "Mexican",
                          "entity": "cuisine"}]
        }),
        Message("I want Tacos!", {
            "intent": "restaurant_search",
            "entities": [{"start": 7,
                          "end": 12,
                          "value": "Mexican",
                          "entity": "cuisine"}]
        })
    ]
    ner_syn.train(TrainingData(training_examples=examples), _config)
    assert ner_syn.synonyms.get("mexican") is None
    assert ner_syn.synonyms.get("tacos") == "Mexican"
Example #4
Source File: loading.py From rasa_nlu with Apache License 2.0 | 6 votes |
def load_data(resource_name: Text,
              language: Optional[Text] = 'en') -> 'TrainingData':
    """Load training data from disk.

    Merges them if loaded from disk and multiple files are found."""
    from rasa_nlu.training_data import TrainingData

    files = utils.list_files(resource_name)
    data_sets = [_load(f, language) for f in files]
    data_sets = [ds for ds in data_sets if ds]
    if len(data_sets) == 0:
        training_data = TrainingData()
    elif len(data_sets) == 1:
        training_data = data_sets[0]
    else:
        training_data = data_sets[0].merge(*data_sets[1:])

    return training_data
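In practice, load_data is the usual entry point for reading training data from disk. A minimal sketch (the file path is a placeholder; a single file in any supported format or a directory of files works, and multiple files are merged):

from rasa_nlu.training_data import load_data

# hypothetical path; point this at your own training file or directory
training_data = load_data("data/examples/rasa/demo-rasa.json")
print(len(training_data.intent_examples))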
Example #5
Source File: test_featurizers.py From rasa_nlu with Apache License 2.0 | 6 votes |
def test_count_vector_featurizer(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"min_ngram": 1,
                                  "max_ngram": 2,
                                  "analyzer": 'char'})
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
Example #6
Source File: dialogflow.py From Rasa_NLU_Chi with Apache License 2.0 | 6 votes |
def read(self, fn, **kwargs):
    # type: (Text, **Any) -> TrainingData
    """Loads training data stored in the Dialogflow data format."""

    language = kwargs["language"]
    fformat = kwargs["fformat"]

    if fformat not in {DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES}:
        raise ValueError("fformat must be either {}, or {}"
                         "".format(DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES))

    root_js = utils.read_json_file(fn)
    examples_js = self._read_examples_js(fn, language, fformat)

    if not examples_js:
        logger.warning("No training examples found for dialogflow file {}!"
                       "".format(fn))
        return TrainingData()
    elif fformat == DIALOGFLOW_INTENT:
        return self._read_intent(root_js, examples_js)
    elif fformat == DIALOGFLOW_ENTITIES:
        return self._read_entities(examples_js)
Example #7
Source File: test_featurizers.py From rasa_nlu with Apache License 2.0 | 6 votes |
def test_count_vector_featurizer_oov_token(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b',
                                  "OOV_token": '__oov__'})
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
Example #8
Source File: markdown.py From rasa_nlu with Apache License 2.0 | 6 votes |
def reads(self, s: Text, **kwargs: Any) -> 'TrainingData':
    """Read markdown string and create TrainingData object"""
    from rasa_nlu.training_data import TrainingData

    self.__init__()
    s = self._strip_comments(s)
    for line in s.splitlines():
        line = line.strip()
        header = self._find_section_header(line)
        if header:
            self._set_current_section(header[0], header[1])
        else:
            self._parse_item(line)
            self._load_files(line)
    return TrainingData(self.training_examples, self.entity_synonyms,
                        self.regex_features, self.lookup_tables)
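A minimal sketch of calling reads on a markdown snippet, assuming the reader class is exposed as MarkdownReader in rasa_nlu.training_data.formats (check the module layout of your version):

from rasa_nlu.training_data.formats import MarkdownReader

md = """## intent:greet
- hey
- hello there
"""

# returns a TrainingData with two examples labeled `greet`
td = MarkdownReader().reads(md)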
Example #9
Source File: crf_entity_extractor.py From rasa_nlu with Apache License 2.0 | 6 votes |
def train(self,
          training_data: TrainingData,
          config: RasaNLUModelConfig,
          **kwargs: Any) -> None:
    # checks whether there is at least one
    # example with an entity annotation
    if training_data.entity_examples:
        self._check_spacy_doc(training_data.training_examples[0])

        # filter out pre-trained entity examples
        filtered_entity_examples = self.filter_trainable_entities(
            training_data.training_examples)

        # convert the dataset into features
        # this will train on ALL examples, even the ones
        # without annotations
        dataset = self._create_dataset(filtered_entity_examples)

        self._train_model(dataset)
Example #10
Source File: sklearn_intent_classifier.py From rasa_nlu with Apache License 2.0 | 6 votes |
def train(self,
          training_data: TrainingData,
          cfg: RasaNLUModelConfig,
          **kwargs: Any) -> None:
    """Train the intent classifier on a data set."""

    num_threads = kwargs.get("num_threads", 1)

    labels = [e.get("intent")
              for e in training_data.intent_examples]

    if len(set(labels)) < 2:
        logger.warning("Can not train an intent classifier. "
                       "Need at least 2 different classes. "
                       "Skipping training of intent classifier.")
    else:
        y = self.transform_labels_str2num(labels)
        X = np.stack([example.get("text_features")
                      for example in training_data.intent_examples])

        self.clf = self._create_classifier(num_threads, y)

        self.clf.fit(X, y)
Example #11
Source File: mitie_intent_classifier.py From Rasa_NLU_Chi with Apache License 2.0 | 6 votes |
def train(self, training_data, cfg, **kwargs):
    # type: (TrainingData, RasaNLUModelConfig, **Any) -> None
    import mitie

    model_file = kwargs.get("mitie_file")
    if not model_file:
        raise Exception("Can not run MITIE intent classifier without a "
                        "language model. Make sure this component is "
                        "preceded by the 'nlp_mitie' component.")

    trainer = mitie.text_categorizer_trainer(model_file)
    trainer.num_threads = kwargs.get("num_threads", 1)

    for example in training_data.intent_examples:
        tokens = self._tokens_of_message(example)
        trainer.add_labeled_text(tokens, example.get("intent"))

    if training_data.intent_examples:
        # we can not call train if there are no examples!
        self.clf = trainer.train()
Example #12
Source File: embedding_intent_classifier.py From rasa_nlu with Apache License 2.0 | 6 votes |
def _prepare_data_for_training(
    self,
    training_data: 'TrainingData',
    intent_dict: Dict[Text, int]
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Prepare data for training"""

    X = np.stack([e.get("text_features")
                  for e in training_data.intent_examples])

    intents_for_X = np.array([intent_dict[e.get("intent")]
                              for e in training_data.intent_examples])

    Y = np.stack([self.encoded_all_intents[intent_idx]
                  for intent_idx in intents_for_X])

    return X, Y, intents_for_X

# tf helpers:
Example #13
Source File: sklearn_intent_classifier.py From Rasa_NLU_Chi with Apache License 2.0 | 6 votes |
def train(self, training_data, cfg, **kwargs):
    # type: (TrainingData, RasaNLUModelConfig, **Any) -> None
    """Train the intent classifier on a data set."""

    num_threads = kwargs.get("num_threads", 1)

    labels = [e.get("intent")
              for e in training_data.intent_examples]

    if len(set(labels)) < 2:
        logger.warning("Can not train an intent classifier. "
                       "Need at least 2 different classes. "
                       "Skipping training of intent classifier.")
    else:
        y = self.transform_labels_str2num(labels)
        X = np.stack([example.get("text_features")
                      for example in training_data.intent_examples])

        self.clf = self._create_classifier(num_threads, y)

        self.clf.fit(X, y)
Example #14
Source File: model.py From Rasa_NLU_Chi with Apache License 2.0 | 6 votes |
def __init__(self,
             cfg,  # type: RasaNLUModelConfig
             component_builder=None,  # type: Optional[ComponentBuilder]
             skip_validation=False  # type: bool
             ):
    # type: (...) -> None

    self.config = cfg
    self.skip_validation = skip_validation
    self.training_data = None  # type: Optional[TrainingData]

    if component_builder is None:
        # If no builder is passed, every interpreter creation will result
        # in a new builder. Hence, no components are reused.
        component_builder = components.ComponentBuilder()

    # Before instantiating the component classes, let's check that all
    # required packages are available
    if not self.skip_validation:
        components.validate_requirements(cfg.component_names)

    # build pipeline
    self.pipeline = self._build_pipeline(cfg, component_builder)
Example #15
Source File: dialogflow.py From rasa_nlu with Apache License 2.0 | 6 votes |
def read(self, fn: Text, **kwargs: Any) -> 'TrainingData':
    """Loads training data stored in the Dialogflow data format."""
    from rasa_nlu.training_data import TrainingData

    language = kwargs["language"]
    fformat = kwargs["fformat"]

    if fformat not in {DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES}:
        raise ValueError("fformat must be either {}, or {}"
                         "".format(DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES))

    root_js = utils.read_json_file(fn)
    examples_js = self._read_examples_js(fn, language, fformat)

    if not examples_js:
        logger.warning("No training examples found for dialogflow file {}!"
                       "".format(fn))
        return TrainingData()
    elif fformat == DIALOGFLOW_INTENT:
        return self._read_intent(root_js, examples_js)
    elif fformat == DIALOGFLOW_ENTITIES:
        return self._read_entities(root_js, examples_js)
Example #16
Source File: spacy_featurizer.py From Rasa_NLU_Chi with Apache License 2.0 | 5 votes |
def train(self, training_data, config, **kwargs):
    # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

    for example in training_data.intent_examples:
        self._set_spacy_features(example)
Example #17
Source File: mitie_entity_extractor.py From Rasa_NLU_Chi with Apache License 2.0 | 5 votes |
def train(self, training_data, config, **kwargs):
    # type: (TrainingData, RasaNLUModelConfig, **Any) -> None
    import mitie

    model_file = kwargs.get("mitie_file")
    if not model_file:
        raise Exception("Can not run MITIE entity extractor without a "
                        "language model. Make sure this component is "
                        "preceded by the 'nlp_mitie' component.")

    trainer = mitie.ner_trainer(model_file)
    trainer.num_threads = kwargs.get("num_threads", 1)
    found_one_entity = False

    # filter out pre-trained entity examples
    filtered_entity_examples = self.filter_trainable_entities(
        training_data.training_examples)

    for example in filtered_entity_examples:
        sample = self._prepare_mitie_sample(example)

        found_one_entity = sample.num_entities > 0 or found_one_entity
        trainer.add(sample)

    # Mitie will fail to train if there is not a single entity tagged
    if found_one_entity:
        self.ner = trainer.train()
Example #18
Source File: count_vectors_featurizer.py From Rasa_NLU_Chi with Apache License 2.0 | 5 votes |
def train(self, training_data, cfg=None, **kwargs):
    # type: (TrainingData, RasaNLUModelConfig, **Any) -> None
    """Take parameters from config and
    construct a new count vectorizer using the sklearn framework."""
    from sklearn.feature_extraction.text import CountVectorizer

    # use even single character word as a token
    self.vect = CountVectorizer(token_pattern=self.token_pattern,
                                strip_accents=self.strip_accents,
                                stop_words=self.stop_words,
                                ngram_range=(self.min_ngram,
                                             self.max_ngram),
                                max_df=self.max_df,
                                min_df=self.min_df,
                                max_features=self.max_features,
                                preprocessor=self.preprocessor)

    lem_exs = [self._lemmatize(example)
               for example in training_data.intent_examples]

    try:
        X = self.vect.fit_transform(lem_exs).toarray()
    except ValueError:
        self.vect = None
        return

    for i, example in enumerate(training_data.intent_examples):
        # create bag for each example
        example.set("text_features", X[i])
Example #19
Source File: ngram_featurizer.py From Rasa_NLU_Chi with Apache License 2.0 | 5 votes |
def train(self, training_data, cfg, **kwargs):
    # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

    start = time.time()
    self.train_on_sentences(training_data.intent_examples)
    logger.debug("Ngram collection took {} seconds"
                 "".format(time.time() - start))

    for example in training_data.training_examples:
        updated = self._text_features_with_ngrams(example,
                                                  self.best_num_ngrams)
        example.set("text_features", updated)
Example #20
Source File: regex_featurizer.py From Rasa_NLU_Chi with Apache License 2.0 | 5 votes |
def train(self, training_data, config, **kwargs):
    # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

    for example in training_data.regex_features:
        self.known_patterns.append(example)

    for example in training_data.training_examples:
        updated = self._text_features_with_regex(example)
        example.set("text_features", updated)
Example #21
Source File: test_featurizers.py From Rasa_NLU_Chi with Apache License 2.0 | 5 votes |
def test_count_vector_featurizer(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})
    message = Message(sentence)
    message.set("intent", "bla")
    data = TrainingData([message])
    ftr.train(data)
    ftr.process(message)

    assert np.all(message.get("text_features")[0] == expected)
Example #22
Source File: test_train.py From Rasa_NLU_Chi with Apache License 2.0 | 5 votes |
def test_train_with_empty_data(language, pipeline, component_builder, tmpdir):
    _config = RasaNLUModelConfig({"pipeline": pipeline,
                                  "language": language})
    trainer = Trainer(_config, component_builder)
    trainer.train(TrainingData())
    persistor = create_persistor(_config)
    persisted_path = trainer.persist(tmpdir.strpath,
                                     persistor,
                                     project_name="my_project")
    loaded = Interpreter.load(persisted_path, component_builder)
    assert loaded.pipeline
    assert loaded.parse("hello") is not None
    assert loaded.parse("Hello today is Monday, again!") is not None
Example #23
Source File: jieba_tokenizer.py From rasa_bot with Apache License 2.0 | 5 votes |
def train(self, training_data, config, **kwargs):
    # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

    for example in training_data.training_examples:
        example.set("tokens", self.tokenize(example.text))
Example #24
Source File: model.py From Rasa_NLU_Chi with Apache License 2.0 | 5 votes |
def train(self, data, **kwargs):
    # type: (TrainingData, **Any) -> Interpreter
    """Trains the underlying pipeline using the provided training data."""

    self.training_data = data

    context = kwargs  # type: Dict[Text, Any]

    for component in self.pipeline:
        updates = component.provide_context()
        if updates:
            context.update(updates)

    # Before the training starts: check that all arguments are provided
    if not self.skip_validation:
        components.validate_arguments(self.pipeline, context)

    # data gets modified internally during the training - hence the copy
    working_data = copy.deepcopy(data)

    for i, component in enumerate(self.pipeline):
        logger.info("Starting to train component {}"
                    "".format(component.name))
        component.prepare_partial_processing(self.pipeline[:i], context)
        updates = component.train(working_data, self.config, **context)
        logger.info("Finished training component.")
        if updates:
            context.update(updates)

    return Interpreter(self.pipeline, context)
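For context, this Trainer.train method is usually driven end to end along the lines of the following sketch (the config and data paths are placeholders; substitute your own):

from rasa_nlu import config
from rasa_nlu.model import Trainer
from rasa_nlu.training_data import load_data

# hypothetical paths; use your own config and training data
training_data = load_data("data/examples/rasa/demo-rasa.json")
trainer = Trainer(config.load("sample_configs/config_spacy.yml"))
interpreter = trainer.train(training_data)  # returns an Interpreter
model_directory = trainer.persist("./models")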
Example #25
Source File: whitespace_tokenizer.py From Rasa_NLU_Chi with Apache License 2.0 | 5 votes |
def train(self, training_data, config, **kwargs):
    # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

    for example in training_data.training_examples:
        example.set("tokens", self.tokenize(example.text))
Example #26
Source File: spacy_tokenizer.py From Rasa_NLU_Chi with Apache License 2.0 | 5 votes |
def train(self, training_data, config, **kwargs):
    # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

    for example in training_data.training_examples:
        example.set("tokens", self.tokenize(example.get("spacy_doc")))
Example #27
Source File: yaha_tokenizer.py From Rasa_NLU_Chi with Apache License 2.0 | 5 votes |
def train(self, training_data, config, **kwargs):
    # type: (TrainingData, RasaNLUConfig, **Any) -> None

    if config['language'] != 'zh':
        raise Exception("tokenizer_yaha is only used for Chinese. "
                        "Check your config JSON file.")

    for example in training_data.training_examples:
        example.set("tokens", self.tokenize(example.text))
Example #28
Source File: jieba_tokenizer.py From Rasa_NLU_Chi with Apache License 2.0 | 5 votes |
def train(self, training_data, config, **kwargs):
    # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

    for example in training_data.training_examples:
        example.set("tokens", self.tokenize(example.text))
Example #29
Source File: mitie_tokenizer.py From Rasa_NLU_Chi with Apache License 2.0 | 5 votes |
def train(self, training_data, config, **kwargs):
    # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

    for example in training_data.training_examples:
        example.set("tokens", self.tokenize(example.text))
Example #30
Source File: test_featurizers.py From rasa_nlu with Apache License 2.0 | 5 votes |
def test_count_vector_featurizer_using_tokens(tokens, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})

    # use an empty string instead of real text to make sure the count
    # vector can only come from the `tokens` feature; relying on
    # `message.text` would not produce the correct result
    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set("tokens", tokens_feature)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set("tokens", tokens_feature)

    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)