Python rasa_nlu.training_data.load_data() Examples
The following are 30 code examples of rasa_nlu.training_data.load_data().
You may also want to check out all available functions and classes of the module rasa_nlu.training_data.
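Before the examples, here is a minimal sketch of the API they all share: load_data() parses a training-data file (Rasa JSON or Markdown, among other supported formats) and returns a TrainingData object. The file path below is a placeholder, and the printed attributes are the ones the tests further down assert against.

from rasa_nlu.training_data import load_data

# placeholder path -- point this at your own NLU training data file
td = load_data('data/nlu.md')

# TrainingData collects the parsed examples and metadata that the
# snippets below inspect
print(td.intents)                 # set of intent names, e.g. {'greet', 'goodbye'}
print(td.entities)                # set of entity names, e.g. {'location', 'cuisine'}
print(len(td.training_examples))  # all labelled examples
print(td.entity_synonyms)         # synonym mapping, e.g. {'veggie': 'vegetarian'}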
Example #1
Source File: bot.py From rasa_core with Apache License 2.0

def train_nlu():
    from rasa_nlu.training_data import load_data
    from rasa_nlu import config
    from rasa_nlu.model import Trainer

    training_data = load_data('data/nlu.md')
    trainer = Trainer(config.load("config.yml"))
    trainer.train(training_data)
    model_directory = trainer.persist('models/nlu/',
                                      fixed_model_name="current")
    return model_directory
Example #2
Source File: bot.py From rasa_core with Apache License 2.0

# agent.load_data is awaited here, so the function must be a coroutine
async def train_dialogue(domain_file="domain.yml",
                         model_path="models/dialogue",
                         training_data_file="data/stories.md"):
    agent = Agent(domain_file,
                  policies=[MemoizationPolicy(max_history=3),
                            MappingPolicy(),
                            RestaurantPolicy(batch_size=100,
                                             epochs=400,
                                             validation_split=0.2)])

    training_data = await agent.load_data(training_data_file)
    agent.train(training_data)
    agent.persist(model_path)
    return agent
Example #3
Source File: trainer.py From weather-bot with MIT License

def train_dialogue(domain_file="domain.yml",
                   model_path="models/dialogue",
                   training_data_file="data/stories.md"):
    agent = Agent(domain_file,
                  policies=[MemoizationPolicy(max_history=3), KerasPolicy()])
    training_data = agent.load_data(training_data_file)
    agent.train(training_data,
                epochs=400,
                batch_size=100,
                validation_split=0.2)
    agent.persist(model_path)
    return agent
Example #4
Source File: test_featurizers.py From Rasa_NLU_Chi with Apache License 2.0

def test_spacy_featurizer_casing(spacy_nlp):
    from rasa_nlu.featurizers import spacy_featurizer

    # if this starts failing for the default model, we should think about
    # removing the lower casing the spacy nlp component does when it
    # retrieves vectors. For compressed spacy models (e.g. models
    # ending in _sm) this test will most likely fail.

    td = training_data.load_data('data/examples/rasa/demo-rasa.json')
    for e in td.intent_examples:
        doc = spacy_nlp(e.text)
        doc_capitalized = spacy_nlp(e.text.capitalize())

        vecs = spacy_featurizer.features_for_doc(doc)
        vecs_capitalized = spacy_featurizer.features_for_doc(doc_capitalized)

        assert np.allclose(vecs, vecs_capitalized, atol=1e-5), \
            "Vectors are unequal for texts '{}' and '{}'".format(
                e.text, e.text.capitalize())
Example #5
Source File: test_training_data.py From Rasa_NLU_Chi with Apache License 2.0

def test_demo_data(filename):
    td = training_data.load_data(filename)
    assert td.intents == {"affirm", "greet", "restaurant_search", "goodbye"}
    assert td.entities == {"location", "cuisine"}
    assert len(td.training_examples) == 42
    assert len(td.intent_examples) == 42
    assert len(td.entity_examples) == 11

    assert td.entity_synonyms == {'Chines': 'chinese',
                                  'Chinese': 'chinese',
                                  'chines': 'chinese',
                                  'vegg': 'vegetarian',
                                  'veggie': 'vegetarian'}

    assert td.regex_features == [{"name": "greet", "pattern": "hey[^\s]*"},
                                 {"name": "zipcode", "pattern": "[0-9]{5}"}]
Example #6
Source File: test_multitenancy.py From Rasa_NLU_Chi with Apache License 2.0

def train_models(component_builder, data):
    # Retrain different multitenancy models
    def train(cfg_name, project_name):
        from rasa_nlu.train import create_persistor
        from rasa_nlu import training_data

        cfg = config.load(cfg_name)
        trainer = Trainer(cfg, component_builder)
        training_data = training_data.load_data(data)

        trainer.train(training_data)
        trainer.persist("test_projects", project_name=project_name)

    train("sample_configs/config_spacy.yml", "test_project_spacy_sklearn")
    train("sample_configs/config_mitie.yml", "test_project_mitie")
    train("sample_configs/config_mitie_sklearn.yml",
          "test_project_mitie_sklearn")
Example #7
Source File: test_interpreter.py From Rasa_NLU_Chi with Apache License 2.0

def test_interpreter(pipeline_template, component_builder, tmpdir):
    test_data = "data/examples/rasa/demo-rasa.json"
    _conf = utilities.base_test_conf(pipeline_template)
    _conf["data"] = test_data
    td = training_data.load_data(test_data)
    interpreter = utilities.interpreter_for(
        component_builder,
        "data/examples/rasa/demo-rasa.json",
        tmpdir.strpath,
        _conf)

    texts = ["good bye", "i am looking for an indian spot"]

    for text in texts:
        result = interpreter.parse(text, time=None)
        assert result['text'] == text
        assert (not result['intent']['name'] or
                result['intent']['name'] in td.intents)
        assert result['intent']['confidence'] >= 0
        # Ensure the model doesn't detect entity types that are not present
        # Models on our test data set are not stable enough to
        # require the exact entities to be found
        for entity in result['entities']:
            assert entity['entity'] in td.entities
Example #8
Source File: test_featurizers.py From rasa_nlu with Apache License 2.0

def test_spacy_featurizer_casing(spacy_nlp):
    from rasa_nlu.featurizers import spacy_featurizer

    # if this starts failing for the default model, we should think about
    # removing the lower casing the spacy nlp component does when it
    # retrieves vectors. For compressed spacy models (e.g. models
    # ending in _sm) this test will most likely fail.

    td = training_data.load_data('data/examples/rasa/demo-rasa.json')
    for e in td.intent_examples:
        doc = spacy_nlp(e.text)
        doc_capitalized = spacy_nlp(e.text.capitalize())

        vecs = spacy_featurizer.features_for_doc(doc)
        vecs_capitalized = spacy_featurizer.features_for_doc(doc_capitalized)

        assert np.allclose(vecs, vecs_capitalized, atol=1e-5), \
            "Vectors are unequal for texts '{}' and '{}'".format(
                e.text, e.text.capitalize())
Example #9
Source File: test_training_data.py From rasa_nlu with Apache License 2.0

def test_dialogflow_data():
    td = training_data.load_data('data/examples/dialogflow/')
    assert len(td.entity_examples) == 5
    assert len(td.intent_examples) == 24
    assert len(td.training_examples) == 24
    assert len(td.lookup_tables) == 2
    assert td.intents == {"affirm", "goodbye", "hi", "inform"}
    assert td.entities == {"cuisine", "location"}
    non_trivial_synonyms = {k: v
                            for k, v in td.entity_synonyms.items()
                            if k != v}
    assert non_trivial_synonyms == {"mexico": "mexican",
                                    "china": "chinese",
                                    "india": "indian"}
    # The order changes based on different computers hence the grouping
    assert {td.lookup_tables[0]['name'],
            td.lookup_tables[1]['name']} == {'location', 'cuisine'}
    assert {len(td.lookup_tables[0]['elements']),
            len(td.lookup_tables[1]['elements'])} == {4, 6}
Example #10
Source File: time_train_test.py From rasa_lookup_demo with Apache License 2.0

def train_model():
    # trains a model and times it
    t = time()
    # training_data = load_data('demo_train.md')
    training_data = load_data("data/company_train_lookup.json")
    td_load_time = time() - t

    trainer = Trainer(config.load("config.yaml"))
    t = time()
    trainer.train(training_data)
    train_time = time() - t

    clear_model_dir()
    t = time()
    # Returns the directory the model is stored in
    model_directory = trainer.persist("./tmp/models")
    persist_time = time() - t

    return td_load_time, train_time, persist_time
Example #11
Source File: test_interpreter.py From rasa_nlu with Apache License 2.0

def test_interpreter(pipeline_template, component_builder, tmpdir):
    test_data = "data/examples/rasa/demo-rasa.json"
    _conf = utilities.base_test_conf(pipeline_template)
    _conf["data"] = test_data
    td = training_data.load_data(test_data)
    interpreter = utilities.interpreter_for(
        component_builder,
        "data/examples/rasa/demo-rasa.json",
        tmpdir.strpath,
        _conf)

    texts = ["good bye", "i am looking for an indian spot"]

    for text in texts:
        result = interpreter.parse(text, time=None)
        assert result['text'] == text
        assert (not result['intent']['name'] or
                result['intent']['name'] in td.intents)
        assert result['intent']['confidence'] >= 0
        # Ensure the model doesn't detect entity types that are not present
        # Models on our test data set are not stable enough to
        # require the exact entities to be found
        for entity in result['entities']:
            assert entity['entity'] in td.entities
Example #12
Source File: test_training_data.py From rasa_nlu with Apache License 2.0

def test_luis_data():
    td = training_data.load_data('data/examples/luis/demo-restaurants.json')
    assert len(td.entity_examples) == 8
    assert len(td.intent_examples) == 28
    assert len(td.training_examples) == 28
    assert td.entity_synonyms == {}
    assert td.intents == {"affirm", "goodbye", "greet", "inform"}
    assert td.entities == {"location", "cuisine"}
Example #13
Source File: convert.py From Rasa_NLU_Chi with Apache License 2.0

def convert_training_data(data_file, out_file, output_format, language):
    td = training_data.load_data(data_file, language)

    if output_format == 'md':
        output = td.as_markdown()
    else:
        output = td.as_json(indent=2)

    write_to_file(out_file, output)
Example #14
Source File: evaluate.py From Rasa_NLU_Chi with Apache License 2.0

def run_evaluation(data_path, model_path,
                   component_builder=None):  # pragma: no cover
    """Evaluate intent classification and entity extraction."""

    # get the metadata config from the package data
    interpreter = Interpreter.load(model_path, component_builder)
    test_data = training_data.load_data(data_path,
                                        interpreter.model_metadata.language)
    extractors = get_entity_extractors(interpreter)
    entity_predictions, tokens = get_entity_predictions(interpreter,
                                                        test_data)

    if duckling_extractors.intersection(extractors):
        entity_predictions = remove_duckling_entities(entity_predictions)
        extractors = remove_duckling_extractors(extractors)

    if is_intent_classifier_present(interpreter):
        intent_targets = get_intent_targets(test_data)
        intent_predictions = get_intent_predictions(interpreter, test_data)
        logger.info("Intent evaluation results:")
        evaluate_intents(intent_targets, intent_predictions)

    if extractors:
        entity_targets = get_entity_targets(test_data)
        logger.info("Entity evaluation results:")
        evaluate_entities(entity_targets, entity_predictions,
                          tokens, extractors)
Example #15
Source File: train.py From Rasa_NLU_Chi with Apache License 2.0

def do_train(cfg,  # type: RasaNLUModelConfig
             data,  # type: Text
             path=None,  # type: Optional[Text]
             project=None,  # type: Optional[Text]
             fixed_model_name=None,  # type: Optional[Text]
             storage=None,  # type: Optional[Text]
             component_builder=None,  # type: Optional[ComponentBuilder]
             url=None,  # type: Optional[Text]
             **kwargs  # type: Any
             ):
    # type: (...) -> Tuple[Trainer, Interpreter, Text]
    """Loads the trainer and the data and runs the training of the model."""

    # Ensure we are training a model that we can save in the end
    # WARN: there is still a race condition if a model with the same name is
    # trained in another subprocess
    trainer = Trainer(cfg, component_builder)
    persistor = create_persistor(storage)
    if url is not None:
        training_data = load_data_from_url(url, cfg.language)
    else:
        training_data = load_data(data, cfg.language)
    interpreter = trainer.train(training_data, **kwargs)
    if path:
        persisted_path = trainer.persist(path,
                                         persistor,
                                         project,
                                         fixed_model_name)
    else:
        persisted_path = None
    return trainer, interpreter, persisted_path
Example #16
Source File: test_training_data.py From Rasa_NLU_Chi with Apache License 2.0

def test_wit_data():
    td = training_data.load_data('data/examples/wit/demo-flights.json')
    assert len(td.entity_examples) == 4
    assert len(td.intent_examples) == 1
    assert len(td.training_examples) == 4
    assert td.entity_synonyms == {}
    assert td.intents == {"flight_booking"}
    assert td.entities == {"location", "datetime"}
Example #17
Source File: test_training_data.py From Rasa_NLU_Chi with Apache License 2.0

def test_dialogflow_data():
    td = training_data.load_data('data/examples/dialogflow/')
    assert len(td.entity_examples) == 5
    assert len(td.intent_examples) == 24
    assert len(td.training_examples) == 24
    assert td.intents == {"affirm", "goodbye", "hi", "inform"}
    assert td.entities == {"cuisine", "location"}
    non_trivial_synonyms = {k: v
                            for k, v in td.entity_synonyms.items()
                            if k != v}
    assert non_trivial_synonyms == {"mexico": "mexican",
                                    "china": "chinese",
                                    "india": "indian"}
Example #18
Source File: convert.py From rasa_nlu with Apache License 2.0

def convert_training_data(data_file, out_file, output_format, language):
    td = training_data.load_data(data_file, language)

    if output_format == 'md':
        output = td.as_markdown()
    else:
        output = td.as_json(indent=2)

    write_to_file(out_file, output)
Example #19
Source File: test_training_data.py From Rasa_NLU_Chi with Apache License 2.0

def test_data_merging(files):
    td_reference = training_data.load_data(files[0])
    td = training_data.load_data(files[1])
    assert len(td.entity_examples) == len(td_reference.entity_examples)
    assert len(td.intent_examples) == len(td_reference.intent_examples)
    assert len(td.training_examples) == len(td_reference.training_examples)
    assert td.intents == td_reference.intents
    assert td.entities == td_reference.entities
    assert td.entity_synonyms == td_reference.entity_synonyms
    assert td.regex_features == td_reference.regex_features
Example #20
Source File: test_training_data.py From Rasa_NLU_Chi with Apache License 2.0

def test_markdown_single_sections():
    td_regex_only = training_data.load_data(
        'data/test/markdown_single_sections/regex_only.md')
    assert td_regex_only.regex_features == [{"name": "greet",
                                             "pattern": "hey[^\s]*"}]

    td_syn_only = training_data.load_data(
        'data/test/markdown_single_sections/synonyms_only.md')
    assert td_syn_only.entity_synonyms == {'Chines': 'chinese',
                                           'Chinese': 'chinese'}
Example #21
Source File: test_training_data.py From Rasa_NLU_Chi with Apache License 2.0

def test_multiword_entities():
    data = """
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "text": "show me flights to New York City",
        "intent": "unk",
        "entities": [
          {
            "entity": "destination",
            "start": 19,
            "end": 32,
            "value": "New York City"
          }
        ]
      }
    ]
  }
}"""
    with tempfile.NamedTemporaryFile(suffix="_tmp_training_data.json") as f:
        f.write(data.encode("utf-8"))
        f.flush()
        td = training_data.load_data(f.name)
        assert len(td.entity_examples) == 1
        example = td.entity_examples[0]
        entities = example.get("entities")
        assert len(entities) == 1
        tokens = WhitespaceTokenizer().tokenize(example.text)
        start, end = MitieEntityExtractor.find_entity(entities[0],
                                                      example.text,
                                                      tokens)
        assert start == 4
        assert end == 7
Example #22
Source File: test_training_data.py From Rasa_NLU_Chi with Apache License 2.0

def test_nonascii_entities():
    data = """
{
  "luis_schema_version": "2.0",
  "utterances" : [
    {
      "text": "I am looking for a ßäæ ?€ö) item",
      "intent": "unk",
      "entities": [
        {
          "entity": "description",
          "startPos": 19,
          "endPos": 26
        }
      ]
    }
  ]
}"""
    with tempfile.NamedTemporaryFile(suffix="_tmp_training_data.json") as f:
        f.write(data.encode("utf-8"))
        f.flush()
        td = training_data.load_data(f.name)
        assert len(td.entity_examples) == 1
        example = td.entity_examples[0]
        entities = example.get("entities")
        assert len(entities) == 1
        entity = entities[0]
        assert entity["value"] == "ßäæ ?€ö)"
        assert entity["start"] == 19
        assert entity["end"] == 27
        assert entity["entity"] == "description"
Example #23
Source File: test_training_data.py From Rasa_NLU_Chi with Apache License 2.0

def test_training_data_conversion(tmpdir, data_file, gold_standard_file,
                                  output_format, language):
    out_path = tmpdir.join("rasa_nlu_data.json")
    convert_training_data(data_file, out_path.strpath, output_format,
                          language)
    td = training_data.load_data(out_path.strpath, language)
    assert td.entity_examples != []
    assert td.intent_examples != []

    gold_standard = training_data.load_data(gold_standard_file, language)
    cmp_message_list(td.entity_examples, gold_standard.entity_examples)
    cmp_message_list(td.intent_examples, gold_standard.intent_examples)
    assert td.entity_synonyms == gold_standard.entity_synonyms

    # converting the converted file back to original
    # file format and performing the same tests
    rto_path = tmpdir.join("data_in_original_format.txt")
    convert_training_data(out_path.strpath, rto_path.strpath, 'json',
                          language)
    rto = training_data.load_data(rto_path.strpath, language)
    cmp_message_list(gold_standard.entity_examples, rto.entity_examples)
    cmp_message_list(gold_standard.intent_examples, rto.intent_examples)
    assert gold_standard.entity_synonyms == rto.entity_synonyms

    # If the above assert fails - this can be used
    # to dump to the file and diff using git
    # with io.open(gold_standard_file) as f:
    #     f.write(td.as_json(indent=2))
Example #24
Source File: test_evaluation.py From Rasa_NLU_Chi with Apache License 2.0

def test_drop_intents_below_freq():
    td = training_data.load_data('data/examples/rasa/demo-rasa.json')
    clean_td = drop_intents_below_freq(td, 0)
    assert clean_td.intents == {'affirm', 'goodbye', 'greet',
                                'restaurant_search'}

    clean_td = drop_intents_below_freq(td, 10)
    assert clean_td.intents == {'affirm', 'restaurant_search'}
Example #25
Source File: visualize.py From rasa_core with Apache License 2.0

# agent.visualize is awaited here, so the function must be a coroutine
async def visualize(config_path: Text, domain_path: Text, stories_path: Text,
                    nlu_data_path: Text, output_path: Text,
                    max_history: int):
    from rasa.core.agent import Agent
    from rasa.core import config

    policies = config.load(config_path)

    agent = Agent(domain_path, policies=policies)

    # this is optional, only needed if the `/greet` type of
    # messages in the stories should be replaced with actual
    # messages (e.g. `hello`)
    if nlu_data_path is not None:
        from rasa_nlu.training_data import load_data

        nlu_data_path = load_data(nlu_data_path)
    else:
        nlu_data_path = None

    logger.info("Starting to visualize stories...")
    await agent.visualize(stories_path, output_path,
                          max_history,
                          nlu_training_data=nlu_data_path)

    full_output_path = "file://{}".format(os.path.abspath(output_path))
    logger.info("Finished graph creation. Saved into {}".format(
        full_output_path))

    import webbrowser
    webbrowser.open(full_output_path)
Example #26
Source File: trainer.py From weather-bot with MIT License

def train_nlu():
    training_data = load_data('data/nlu-data.md')
    trainer = Trainer(config.load("nlu-config.yml"))
    trainer.train(training_data)
    model_directory = trainer.persist('models/nlu/',
                                      fixed_model_name="current")
    return model_directory
Example #27
Source File: bot.py From rasa_bot with Apache License 2.0

def train_nlu():
    from rasa_nlu.training_data import load_data
    from rasa_nlu.config import RasaNLUModelConfig
    from rasa_nlu.model import Trainer
    from rasa_nlu import config

    training_data = load_data("data/nlu.json")
    trainer = Trainer(config.load("data/nlu_model_config.json"))
    trainer.train(training_data)
    model_directory = trainer.persist("models/",
                                      project_name="ivr",
                                      fixed_model_name="demo")
    return model_directory
Example #28
Source File: test_training_data.py From rasa_nlu with Apache License 2.0

def test_wit_data():
    td = training_data.load_data('data/examples/wit/demo-flights.json')
    assert len(td.entity_examples) == 4
    assert len(td.intent_examples) == 1
    assert len(td.training_examples) == 4
    assert td.entity_synonyms == {}
    assert td.intents == {"flight_booking"}
    assert td.entities == {"location", "datetime"}
Example #29
Source File: test_training_data.py From rasa_nlu with Apache License 2.0

def test_lookup_table_json():
    lookup_fname = 'data/test/lookup_tables/plates.txt'
    td_lookup = training_data.load_data(
        'data/test/lookup_tables/lookup_table.json')
    assert td_lookup.lookup_tables[0]['name'] == 'plates'
    assert td_lookup.lookup_tables[0]['elements'] == lookup_fname
    assert td_lookup.lookup_tables[1]['name'] == 'drinks'
    assert td_lookup.lookup_tables[1]['elements'] == [
        'mojito', 'lemonade', 'sweet berry wine', 'tea', 'club mate']
Example #30
Source File: test_training_data.py From rasa_nlu with Apache License 2.0

def test_lookup_table_md():
    lookup_fname = 'data/test/lookup_tables/plates.txt'
    td_lookup = training_data.load_data(
        'data/test/lookup_tables/lookup_table.md')
    assert td_lookup.lookup_tables[0]['name'] == 'plates'
    assert td_lookup.lookup_tables[0]['elements'] == lookup_fname
    assert td_lookup.lookup_tables[1]['name'] == 'drinks'
    assert td_lookup.lookup_tables[1]['elements'] == [
        'mojito', 'lemonade', 'sweet berry wine', 'tea', 'club mate']