Python Examples of pycrfsuite.Trainer

Source File: crf.py From razzy-spinner with GNU General Public License v3.0

6 votes

def train(self, train_data, model_file):
        """
        Fit a CRFSuite model on annotated sentences and record its location.

        :param train_data: the annotated sentences, each one a sequence of
            (token, label) pairs.
        :type train_data: list(list(tuple(str, str)))
        :param model_file: path of the file the trained model is written to.
        """
        crf_trainer = pycrfsuite.Trainer(verbose=self._verbose)
        crf_trainer.set_params(self._training_options)

        for sentence in train_data:
            tokens, labels = zip(*sentence)
            # One feature set per token position; the feature function
            # receives the whole token sequence plus the position.
            features = [
                self._feature_func(tokens, position)
                for position, _ in enumerate(tokens)
            ]
            crf_trainer.append(features, labels)

        # Training writes the model to model_file ...
        crf_trainer.train(model_file)
        # ... and the same path is then handed to set_model_file.
        self.set_model_file(model_file)

Source File: test_tagger.py From python-crfsuite with MIT License

6 votes

def test_tag_formats(tmpdir, xseq, yseq):
    """Tagging yields the training labels for dict items and for bare key views."""
    # Give every feature a weight of 1 so that dict input and key-only
    # input describe the same observations.
    model_path = str(tmpdir.join('model.crfsuite'))
    xseq = [dict.fromkeys(item, 1) for item in xseq]

    fitter = Trainer()
    fitter.set('c2', 1e-6)  # tiny L2 penalty so the model overfits
    fitter.append(xseq, yseq)
    fitter.train(model_path)

    # dict-valued items
    with Tagger().open(model_path) as tagger:
        predicted = tagger.tag(xseq)
        assert predicted == yseq

    # key-only items (feature names as strings)
    with Tagger().open(model_path) as tagger:
        names_only = [item.keys() for item in xseq]
        assert tagger.tag(names_only) == yseq

Source File: crf.py From webQA_sequence_labelling_pytorch with MIT License

6 votes

def train_crf(x_train, y_train):
    """Train a CRF on paired feature/label sequences and write it to ``param.crf_path``.

    ``x_train`` and ``y_train`` are iterated in lockstep; each pair is
    appended to the trainer as one training sequence.
    """
    print('Training...')
    crf_trainer = pycrfsuite.Trainer(verbose=False)
    for features, labels in zip(x_train, y_train):
        crf_trainer.append(features, labels)

    hyperparameters = {
        'c1': 1.0,              # coefficient for L1 penalty
        'c2': 1e-3,             # coefficient for L2 penalty
        'max_iterations': 500,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True,
    }
    crf_trainer.set_params(hyperparameters)

    crf_trainer.train(param.crf_path)

Source File: test_trainer.py From python-crfsuite with MIT License

6 votes

def test_training_messages(tmpdir, xseq, yseq):
    """Training progress is delivered through the overridable ``message`` hook."""

    class CapturingTrainer(Trainer):
        # Stores every message passed to the hook.
        def __init__(self):
            self.messages = []

        def message(self, message):
            self.messages.append(message)

    capturing = CapturingTrainer()
    capturing.select('lbfgs')
    capturing.append(xseq, yseq)
    # Selecting an algorithm and appending data must not emit anything.
    assert not capturing.messages

    model_path = str(tmpdir.join('model.crfsuite'))
    capturing.train(model_path)
    # After training, messages were captured, including the model type line.
    assert capturing.messages
    assert 'type: CRF1d\n' in capturing.messages

Source File: test_trainer.py From python-crfsuite with MIT License

6 votes

def test_training_messages_exception(tmpdir, xseq, yseq):
    """An exception raised inside the ``message`` hook propagates out of ``train``."""

    class MyException(Exception):
        pass

    class BadTrainer(Trainer):
        # Fails as soon as the trainer reports anything.
        def message(self, message):
            raise MyException("error")

    failing = BadTrainer()
    failing.select('lbfgs')
    failing.append(xseq, yseq)

    model_path = str(tmpdir.join('model.crfsuite'))

    with pytest.raises(MyException):
        failing.train(model_path)

Source File: crfsuiteutil.py From estnltk with GNU General Public License v2.0

6 votes

def train(self, nerdocs, mode_filename):
        """Train a CRF model on the given documents and save it to disk.

        Parameters
        ----------
        nerdocs: list of estnltk.estner.ner.Document.
            The documents for model training.
        mode_filename: str
            The filename where to save the model.
        """
        crf_trainer = pycrfsuite.Trainer(algorithm=self.algorithm,
                                         params={'c2': self.c2},
                                         verbose=self.verbose)

        # Every sentence of every document becomes one training sequence.
        all_sentences = (sentence
                         for document in nerdocs
                         for sentence in document.sentences)
        for sentence in all_sentences:
            features = [token.feature_list() for token in sentence]
            labels = [token.label for token in sentence]
            crf_trainer.append(features, labels)

        crf_trainer.train(mode_filename)

Source File: crf.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International

6 votes

def train(self, train_data, model_file):
        """
        Train the CRF tagger with CRFSuite and store the resulting model path.

        :param train_data: list of annotated sentences; every sentence is a
            sequence of (token, label) tuples.
        :type train_data: list(list(tuple(str, str)))
        :param model_file: file the trained model will be saved to.
        """
        trainer = pycrfsuite.Trainer(verbose=self._verbose)
        trainer.set_params(self._training_options)

        for annotated in train_data:
            # Split the (token, label) pairs into two parallel tuples.
            tokens, labels = zip(*annotated)
            features = []
            for index in range(len(tokens)):
                features.append(self._feature_func(tokens, index))
            trainer.append(features, labels)

        # Run training; CRFSuite writes its output to model_file.
        trainer.train(model_file)
        # Pass the path of the freshly written model to set_model_file.
        self.set_model_file(model_file)

Source File: test_trainer.py From python-crfsuite with MIT License

5 votes

def test_help_invalid_parameter():
    """``help`` raises ValueError for 'foo' and 'c1' once 'l2sgd' is selected."""
    l2sgd_trainer = Trainer()
    l2sgd_trainer.select('l2sgd')

    # Asking for help on 'foo' segfaults without a workaround;
    # see https://github.com/chokkan/crfsuite/pull/21
    for rejected_name in ('foo', 'c1'):
        with pytest.raises(ValueError):
            l2sgd_trainer.help(rejected_name)

Source File: test_trainer.py From python-crfsuite with MIT License

5 votes

def test_params_and_help():
    """The parameter list and help text follow the selected algorithm."""
    trainer = Trainer()

    # lbfgs exposes both penalties plus num_memories.
    trainer.select('lbfgs')
    for expected in ('c1', 'c2', 'num_memories'):
        assert expected in trainer.params()
    c1_help = trainer.help('c1')
    assert 'L1' in c1_help

    # l2sgd only has the L2 penalty.
    trainer.select('l2sgd')
    assert 'c2' in trainer.params()
    assert 'c1' not in trainer.params()
    c2_help = trainer.help('c2')
    assert 'L2' in c2_help

Source File: test_trainer.py From python-crfsuite with MIT License

5 votes

def test_trainer_select_raises_error():
    """Selecting an unknown algorithm name raises ValueError."""
    fresh_trainer = Trainer()
    unknown_algorithm = 'foo'
    with pytest.raises(ValueError):
        fresh_trainer.select(unknown_algorithm)

Source File: test_trainer.py From python-crfsuite with MIT License

5 votes

def test_get_parameter():
    """``get`` reflects a value written with ``set``."""
    target, tolerance = 0.1, 1e-6

    l2sgd_trainer = Trainer()
    l2sgd_trainer.select('l2sgd')
    # Before the assignment, c2 differs from the target value ...
    before = l2sgd_trainer.get('c2')
    assert abs(before - target) > tolerance
    # ... and afterwards it matches it.
    l2sgd_trainer.set('c2', target)
    after = l2sgd_trainer.get('c2')
    assert abs(after - target) < tolerance

Source File: test_trainer.py From python-crfsuite with MIT License

5 votes

def test_set_parameters_in_constructor():
    """Parameters passed via ``params=`` are readable through ``get``."""
    configured = Trainer(params={'c2': 100})
    stored_c2 = configured.get('c2')
    assert abs(stored_c2 - 100) < 1e-6

Source File: crf_trainer.py From MicroTokenizer with MIT License

5 votes

def __init__(self, feature_func_list=None):
        """Create the underlying trainer and choose the feature functions.

        :param feature_func_list: feature functions to use; when it is
            ``None`` or otherwise falsy (e.g. an empty list),
            ``default_feature_func_list`` is used instead.
        """
        self.crf_trainer = pycrfsuite.Trainer(verbose=False)

        # Fall back to the defaults for any falsy argument.
        self.feature_func_list = feature_func_list or default_feature_func_list

Source File: test_trainer.py From python-crfsuite with MIT License

5 votes

def test_trainer_noselect_noappend(tmpdir):
    """Training with no algorithm selected and no data appended must not crash."""
    # This shouldn't segfault; see https://github.com/chokkan/crfsuite/pull/21
    model_path = str(tmpdir.join('model.crfsuite'))
    untouched_trainer = Trainer()
    untouched_trainer.train(model_path)

Source File: test_trainer.py From python-crfsuite with MIT License

5 votes

def test_trainer_noappend(tmpdir):
    """Training with an algorithm selected but no data appended must not crash."""
    # This shouldn't segfault; see https://github.com/chokkan/crfsuite/pull/21
    empty_trainer = Trainer()
    empty_trainer.select('lbfgs')
    model_path = str(tmpdir.join('model.crfsuite'))
    empty_trainer.train(model_path)

Source File: test_trainer.py From python-crfsuite with MIT License

5 votes

def test_trainer(tmpdir, xseq, yseq):
    """``train`` creates the model file at the requested path."""
    lbfgs_trainer = Trainer('lbfgs')
    lbfgs_trainer.append(xseq, yseq)

    model_path = str(tmpdir.join('model.crfsuite'))
    # The file must not exist beforehand, and must exist after training.
    existed_before = os.path.isfile(model_path)
    assert not existed_before
    lbfgs_trainer.train(model_path)
    assert os.path.isfile(model_path)

Source File: CRF.py From indic_tagger with Apache License 2.0

5 votes

def __init__(self, model_path):
        """Create a quiet CRFsuite trainer with fixed hyper-parameters.

        :param model_path: stored unchanged on ``self.model_path``.
        """
        self.trainer = pycrfsuite.Trainer(verbose=False)
        self.model_path = model_path
        self.trainer.set_params({
            'c1': 1.0,   # coefficient for L1 penalty
            # Previously written as `1-3`, an integer subtraction yielding -2
            # (a negative regularization weight); 1e-3 is the intended value.
            'c2': 1e-3,  # coefficient for L2 penalty
            'max_iterations': 50,  # stop earlier

            # include transitions that are possible, but not observed
            'feature.possible_transitions': True
        })

Source File: conftest.py From python-crfsuite with MIT License

5 votes

def model_filename(tmpdir, xseq, yseq):
    """Train an lbfgs model on one sequence under ``tmpdir`` and return its path."""
    from pycrfsuite import Trainer

    path = str(tmpdir.join('model.crfsuite'))
    fitter = Trainer('lbfgs', verbose=False)
    fitter.append(xseq, yseq)
    fitter.train(path)
    return path

Source File: test_tagger.py From python-crfsuite with MIT License

5 votes

def test_append_strstr_dicts(tmpdir):
    """String-valued dict features show up as 'key:value' attributes in the model."""
    observations = [
        {'foo': 'bar'},
        {'baz': False},
        {'foo': 'bar', 'baz': True},
        {'baz': 0.2},
    ]
    labels = ['spam', 'egg', 'spam', 'spam']

    fitter = Trainer()
    fitter.append(observations, labels)
    model_path = str(tmpdir.join('model.crfsuite'))
    fitter.train(model_path)

    with Tagger().open(model_path) as tagger:
        model_info = tagger.info()
        # The string value is folded into the attribute name; 'baz' keeps its bare name.
        assert set(model_info.attributes.keys()) == {'foo:bar', 'baz'}
        assert model_info.state_features[('foo:bar', 'spam')] > 0

Source File: entity_extractor_worker.py From texta with GNU General Public License v3.0

5 votes

def _train_and_save(self, X_train, y_train):
        """Feed sequences to a CRF trainer while memory allows, then train and save.

        Every 2500th sequence (starting with the first) the available system
        memory is compared against ``self.min_mb_available_memory``; when it
        is lower, a warning is stored in ``self.train_summary``, the event is
        logged, and no further sequences are added.

        :param X_train: feature sequences.
        :param y_train: label sequences, paired with ``X_train`` by position.
        :return: the trainer after ``train`` has been called.
        """
        trainer = Trainer(verbose=False)
        for index, (features, labels) in enumerate(zip(X_train, y_train)):
            # Only sample memory on every 2500th item (the check is skipped otherwise).
            low_on_memory = (
                index % 2500 == 0
                and (psutil.virtual_memory().available / 1000000) < self.min_mb_available_memory
            )
            if low_on_memory:
                print('EntityExtractorWorker:_get_memory_safe_features - Less than {} Mb of memory remaining, breaking adding more data.'.format(self.min_mb_available_memory))
                self.train_summary["warning"] = "Trained on {} documents, because more documents don't fit into memory".format(index)

                event_text = 'Less than {}Mb of memory available, stopping adding more training data. Iteration {}.'.format(self.min_mb_available_memory, index)
                log_dict = {
                    'task': 'EntityExtractorWorker:_train_and_save',
                    'event': event_text,
                    'data': {'task_id': self.task_id}
                }
                self.info_logger.info("Memory", extra=log_dict)
                break
            trainer.append(features, labels)

        training_params = {
            'c1': 0.5,  # coefficient for L1 penalty
            'c2': 1e-4,  # coefficient for L2 penalty
            'max_iterations': 50,  # stop earlier
            # transitions that are possible, but not observed
            'feature.possible_transitions': True,
        }
        trainer.set_params(training_params)

        # Train and save
        output_model_path = create_file_path(self.model_name, MODELS_DIR, self.task_type)
        trainer.train(output_model_path)
        return trainer

Source File: crf.py From webQA_sequence_labelling_pytorch with MIT License

5 votes

def test():
    """Smoke-test pycrfsuite: train on one tiny sequence, then tag it back.

    The model is written to (and read from) 'conll2002-esp.crfsuite' in the
    current working directory; inputs and the predicted/expected labels are
    printed.
    """
    # One training sequence of two items, each a list of "name=value" strings.
    # (An earlier dict-based sample assigned to X_train was dead code: it was
    # overwritten before use, so it has been dropped.)
    X_train = [[['foo=1', 'bar=0', 'c=9', 's=0', 'sd=12', 'cd=2', 'ca=3', 'd=True', 'cc=89'],
                ['foo=4', 'bar=7', 'c=3', 's=1', 'sd=8', 'cd=9', 'ca=1', 'd=False', 'cc=18']]]
    y_train = [['0', '1']]
    model_path = 'conll2002-esp.crfsuite'

    trainer = pycrfsuite.Trainer(verbose=False)

    for xseq, yseq in zip(X_train, y_train):
        print('x: ', xseq)
        print('y: ', yseq)
        trainer.append(xseq, yseq)

    trainer.set_params({
        'c1': 1.0,   # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 500,  # stop earlier

        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })

    trainer.train(model_path)

    # Open the model through the context manager so the tagger is closed
    # again once the predictions have been printed.
    with pycrfsuite.Tagger().open(model_path) as tagger:
        print("Predicted:", ' '.join(tagger.tag(X_train[0])))
    print("Correct:  ", ' '.join(y_train[0]))

Source File: entity_extractor.py From ai-chatbot-framework with MIT License

5 votes

def train(self, train_sentences, model_name):
        """
        Train a NER model and save it as ``model_files/<model_name>.model``.

        :param train_sentences: sentences that are converted with
            ``sent_to_features`` and ``sent_to_labels``
        :param model_name: name inserted into the output file path
        :return: always ``True``
        """
        # Convert all sentences to features first, then all of them to labels.
        feature_seqs = [self.sent_to_features(sentence) for sentence in train_sentences]
        label_seqs = [self.sent_to_labels(sentence) for sentence in train_sentences]

        crf_trainer = pycrfsuite.Trainer(verbose=False)
        for feature_seq, label_seq in zip(feature_seqs, label_seqs):
            crf_trainer.append(feature_seq, label_seq)

        hyperparameters = {
            'c1': 1.0,  # coefficient for L1 penalty
            'c2': 1e-3,  # coefficient for L2 penalty
            'max_iterations': 50,  # stop earlier
            # include transitions that are possible, but not observed
            'feature.possible_transitions': True,
        }
        crf_trainer.set_params(hyperparameters)

        output_path = 'model_files/%s.model' % model_name
        crf_trainer.train(output_path)
        return True

    # Extract Labels from BIO tagged sentence

Source File: extractor.py From HotPepperGourmetDialogue with MIT License

5 votes

def train(self, train_x, train_y, save_file='model.crfsuite'):
        """Train a CRF on the paired sequences, save it, and open it in the tagger.

        :param train_x: feature sequences.
        :param train_y: label sequences, paired with ``train_x`` by position.
        :param save_file: path the model is written to and then opened from.
        """
        crf_trainer = pycrfsuite.Trainer(verbose=False)
        for features, labels in zip(train_x, train_y):
            crf_trainer.append(features, labels)

        hyperparameters = {
            'c1': 1.0,   # coefficient for L1 penalty
            'c2': 1e-3,  # coefficient for L2 penalty
            'max_iterations': 50,  # stop earlier
            'feature.possible_transitions': True,
        }
        crf_trainer.set_params(hyperparameters)

        crf_trainer.train(save_file)
        # Point the tagger at the model that was just written.
        self.__tagger.open(save_file)

Source File: tag.py From ChemDataExtractor with MIT License

5 votes

def train(self, sentences, model):
        """Train the CRF tagger with CRFSuite, then load the trained model.

        :param sentences: Annotated sentences, each a sequence of
            (token, label) pairs.
        :param model: Path the model is saved to and afterwards loaded from.
        """
        crf_trainer = pycrfsuite.Trainer(verbose=True)
        crf_trainer.set_params(self.params)
        for annotated in sentences:
            tokens, labels = zip(*annotated)
            # Features are computed per position from the full token sequence.
            features = [
                self._get_features(tokens, position)
                for position, _ in enumerate(tokens)
            ]
            crf_trainer.append(features, labels)
        crf_trainer.train(model)
        self.load(model)

Source File: crf_pos_tagger.py From Jiayan with MIT License

5 votes

def train(self, train_x, train_y, out_model):
        """Train a CRF on the non-empty sequence pairs and write it to ``out_model``.

        Pairs in which either the features or the labels are falsy (e.g.
        empty) are skipped. The trainer's last logged iteration is printed
        once training has finished.
        """
        crf_trainer = pycrfsuite.Trainer(verbose=False)
        for features, labels in zip(train_x, train_y):
            if not features or not labels:
                continue
            crf_trainer.append(features, labels)

        hyperparameters = {
            'c1': 1.0,     # coefficient for L1 penalty
            'c2': 1e-3,    # coefficient for L2 penalty
            'max_iterations': 50,    # stop earlier
            # include transitions that are possible, but not observed
            'feature.possible_transitions': True,
        }
        crf_trainer.set_params(hyperparameters)

        crf_trainer.train(out_model)
        print(crf_trainer.logparser.last_iteration)

Source File: crf_sent_tagger.py From Jiayan with MIT License

5 votes

def train(self, train_x, train_y, out_model):
        """Fit a CRF model and save it to ``out_model``.

        Only pairs where both the feature sequence and the label sequence are
        truthy are used for training. After training, the last iteration
        recorded by the trainer's log parser is printed.
        """
        settings = {
            'c1': 1.0,  # coefficient for L1 penalty
            'c2': 1e-3,  # coefficient for L2 penalty
            'max_iterations': 50,  # stop earlier
            'feature.possible_transitions': True,  # include transitions that are possible, but not observed
        }

        trainer = pycrfsuite.Trainer(verbose=False)
        usable_pairs = ((xs, ys) for xs, ys in zip(train_x, train_y) if xs and ys)
        for xs, ys in usable_pairs:
            trainer.append(xs, ys)

        trainer.set_params(settings)

        trainer.train(out_model)
        print(trainer.logparser.last_iteration)

Source File: pycrfsuite.py From TBBTCorpus with Apache License 2.0

5 votes

def __init__(self, enumerations=100, L1Penalty=1.0, L2Penalty=1e-3):
        """Create a quiet CRF trainer configured from the given settings.

        :param enumerations: passed to the trainer as ``max_iterations``.
        :param L1Penalty: passed to the trainer as ``c1``.
        :param L2Penalty: passed to the trainer as ``c2``.
        """
        training_params = {
            'c1': L1Penalty,
            'c2': L2Penalty,
            'max_iterations': enumerations,
            'feature.possible_transitions': True,
        }
        self.crf_feature_train = crf.Trainer(verbose=False)
        self.crf_feature_train.set_params(training_params)
    
    #Method to append more features to the trainer
    #More features include TOKEN and its respective POS
    #It also includes the act tag of the sentence

Source File: training.py From parserator with MIT License

5 votes

def trainModel(training_data, module, model_path,
               params_to_set={'c1':0.1, 'c2':0.01, 'feature.minfreq':0}):
    """Train a CRF model from labelled components and write it to ``model_path``.

    ``training_data`` yields pairs whose second element is a sequence of
    (token, label) pairs; the first element is ignored. Tokens are turned
    into features with ``module.tokens2features``. ``params_to_set`` is
    handed to the trainer as its ``params``.
    """
    crf_trainer = pycrfsuite.Trainer(verbose=False, params=params_to_set)

    for _, labelled_components in training_data:
        # Separate the (token, label) pairs into parallel sequences.
        tokens, labels = zip(*labelled_components)
        features = module.tokens2features(tokens)
        crf_trainer.append(features, labels)

    crf_trainer.train(model_path)

Source File: utils.py From parserator with MIT License

5 votes

def fit(self, X, y, model_path, **params):
        """Train a CRF on raw texts and their labels, saving it to ``model_path``.

        The original signature listed ``model_path`` after ``**params``,
        which is a SyntaxError; it now precedes the catch-all.

        :param X: raw texts; each is tokenized with ``tokenize``.
        :param y: label sequences, paired with ``X`` by position.
        :param model_path: path the trained model is written to.
        :param params: accepted but not used; the trainer parameters are read
            from the estimator's instance attributes instead.
        """
        # sklearn requires parameters to be declared as fields of the estimator,
        # and we can't have a full stop there, so they are stored with
        # underscores. Replace the underscores with full stops here.
        # NOTE(review): every underscore is replaced, not just the first --
        # confirm no stored parameter name needs to keep one.
        crf_params = {name.replace('_', '.'): value
                      for name, value in self.__dict__.items()}
        trainer = pycrfsuite.Trainer(verbose=False, params=crf_params)
        for raw_text, labels in zip(X, y):
            tokens = tokenize(raw_text)
            trainer.append(tokens2features(tokens), labels)
        trainer.train(model_path)
        # NOTE(review): assumes `reload` is in scope (a builtin on Python 2
        # only) -- confirm against the module's imports.
        reload(parserator)

Source File: test_tagger.py From python-crfsuite with MIT License

4 votes

def test_append_nested_dicts(tmpdir):
    """Nested dict/list/set features are flattened into ':'-joined attribute names."""
    first_item = {
        "foo": {
            "bar": "baz",
            "spam": 0.5,
            "egg": ["x", "y"],
            "ham": {"x": -0.5, "y": -0.1},
        },
    }
    second_item = {
        "foo": {
            "bar": "ham",
            "spam": -0.5,
            "ham": {"x", "y"},
        },
    }

    fitter = Trainer()
    fitter.append([first_item, second_item], ['first', 'second'])
    model_path = str(tmpdir.join('model.crfsuite'))
    fitter.train(model_path)

    # Attributes expected to carry positive weight for each label.
    first_features = ['foo:bar:baz', 'foo:spam', 'foo:egg:x', 'foo:egg:y']
    second_features = ['foo:bar:ham', 'foo:ham:x', 'foo:ham:y']

    with Tagger().open(model_path) as tagger:
        model_info = tagger.info()
        assert set(model_info.attributes.keys()) == {
            'foo:bar:baz',
            'foo:spam',
            'foo:egg:x',
            'foo:egg:y',
            'foo:ham:x',
            'foo:ham:y',
            'foo:bar:ham',
        }

        for name in first_features:
            assert model_info.state_features[(name, 'first')] > 0
            assert model_info.state_features.get((name, 'second'), 0) <= 0

        for name in second_features:
            assert model_info.state_features[(name, 'second')] > 0
            assert model_info.state_features.get((name, 'first'), 0) <= 0

Python pycrfsuite.Trainer() Examples