Python pycrfsuite.Trainer() Examples
The following are 30 code examples of pycrfsuite.Trainer().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pycrfsuite, or try the search function.
Example #1
Source File: crf.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def train(self, train_data, model_file):
    """
    Fit a CRFSuite model on annotated sentences and record where it was saved.

    :param train_data: annotated sentences, each a sequence of
        (token, label) pairs.
    :type train_data: list(list(tuple(str, str)))
    :param model_file: path the trained model is written to.
    """
    crf_trainer = pycrfsuite.Trainer(verbose=self._verbose)
    crf_trainer.set_params(self._training_options)

    for sentence in train_data:
        # Split the (token, label) pairs into two parallel tuples.
        words, tags = zip(*sentence)
        feature_seq = []
        for position in range(len(words)):
            feature_seq.append(self._feature_func(words, position))
        crf_trainer.append(feature_seq, tags)

    # Training writes the fitted model straight to model_file.
    crf_trainer.train(model_file)
    # Register the freshly written model file on this object.
    self.set_model_file(model_file)
Example #2
Source File: test_tagger.py From python-crfsuite with MIT License | 6 votes |
def test_tag_formats(tmpdir, xseq, yseq):
    """Tagging yields the same labels for dict features and key-only features."""
    model_path = str(tmpdir.join('model.crfsuite'))
    # Give every feature a coefficient of 1 so both input formats agree.
    xseq = [{name: 1 for name in item} for item in xseq]

    fitter = Trainer()
    fitter.set('c2', 1e-6)  # make sure model overfits
    fitter.append(xseq, yseq)
    fitter.train(model_path)

    # Features supplied as dicts.
    with Tagger().open(model_path) as tagger:
        assert tagger.tag(xseq) == yseq

    # Features supplied as strings (only the keys of each dict).
    with Tagger().open(model_path) as tagger:
        keys_only = [item.keys() for item in xseq]
        assert tagger.tag(keys_only) == yseq
Example #3
Source File: crf.py From webQA_sequence_labelling_pytorch with MIT License | 6 votes |
def train_crf(x_train, y_train):
    """Train a CRF on paired feature/label sequences and save it to param.crf_path."""
    print('Training...')
    crf_trainer = pycrfsuite.Trainer(verbose=False)

    for features, labels in zip(x_train, y_train):
        crf_trainer.append(features, labels)

    training_options = {
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 500,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True,
    }
    crf_trainer.set_params(training_options)
    crf_trainer.train(param.crf_path)
Example #4
Source File: test_trainer.py From python-crfsuite with MIT License | 6 votes |
def test_training_messages(tmpdir, xseq, yseq):
    """A Trainer subclass receives training log lines through message()."""

    class CapturingTrainer(Trainer):
        def __init__(self):
            self.messages = []

        def message(self, message):
            # Collect every log line instead of printing it.
            self.messages.append(message)

    capturing = CapturingTrainer()
    capturing.select('lbfgs')
    capturing.append(xseq, yseq)
    # Nothing has been logged before training starts.
    assert not capturing.messages

    model_path = str(tmpdir.join('model.crfsuite'))
    capturing.train(model_path)

    assert capturing.messages
    assert 'type: CRF1d\n' in capturing.messages
Example #5
Source File: test_trainer.py From python-crfsuite with MIT License | 6 votes |
def test_training_messages_exception(tmpdir, xseq, yseq):
    """An exception raised inside message() propagates out of train()."""

    class MyException(Exception):
        pass

    class BadTrainer(Trainer):
        def message(self, message):
            raise MyException("error")

    failing = BadTrainer()
    failing.select('lbfgs')
    failing.append(xseq, yseq)

    model_path = str(tmpdir.join('model.crfsuite'))
    with pytest.raises(MyException):
        failing.train(model_path)
Example #6
Source File: crfsuiteutil.py From estnltk with GNU General Public License v2.0 | 6 votes |
def train(self, nerdocs, mode_filename):
    """Train a CRF model using given documents.

    Parameters
    ----------
    nerdocs: list of estnltk.estner.ner.Document.
        The documents for model training.
    mode_filename: str
        The filename where to save the model.
    """
    crf_trainer = pycrfsuite.Trainer(
        algorithm=self.algorithm,
        params={'c2': self.c2},
        verbose=self.verbose,
    )

    # Flatten documents into sentences; each sentence is one training sequence.
    for sentence in (s for document in nerdocs for s in document.sentences):
        features = [token.feature_list() for token in sentence]
        labels = [token.label for token in sentence]
        crf_trainer.append(features, labels)

    crf_trainer.train(mode_filename)
Example #7
Source File: crf.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def train(self, train_data, model_file):
    """
    Train the CRF tagger with CRFSuite and store the resulting model.

    :param train_data: the annotated sentences, each a sequence of
        (token, label) pairs.
    :type train_data: list(list(tuple(str, str)))
    :param model_file: file the trained model is saved to.
    """
    suite_trainer = pycrfsuite.Trainer(verbose=self._verbose)
    suite_trainer.set_params(self._training_options)

    for annotated in train_data:
        toks, tags = zip(*annotated)
        # One feature set per token position in the sentence.
        per_token = [self._feature_func(toks, idx) for idx, _ in enumerate(toks)]
        suite_trainer.append(per_token, tags)

    # Run training; the model ends up in model_file.
    suite_trainer.train(model_file)
    # Register the saved model file on this object.
    self.set_model_file(model_file)
Example #8
Source File: test_trainer.py From python-crfsuite with MIT License | 5 votes |
def test_help_invalid_parameter():
    """help() raises ValueError for names that are not l2sgd parameters."""
    sgd_trainer = Trainer()
    sgd_trainer.select('l2sgd')

    # This segfaults without a workaround;
    # see https://github.com/chokkan/crfsuite/pull/21
    for unknown_name in ('foo', 'c1'):
        with pytest.raises(ValueError):
            sgd_trainer.help(unknown_name)
Example #9
Source File: test_trainer.py From python-crfsuite with MIT License | 5 votes |
def test_params_and_help():
    """params() and help() reflect the currently selected algorithm."""
    algo_trainer = Trainer()

    algo_trainer.select('lbfgs')
    for expected in ('c1', 'c2', 'num_memories'):
        assert expected in algo_trainer.params()
    assert 'L1' in algo_trainer.help('c1')

    # l2sgd only exposes the L2 coefficient.
    algo_trainer.select('l2sgd')
    assert 'c2' in algo_trainer.params()
    assert 'c1' not in algo_trainer.params()
    assert 'L2' in algo_trainer.help('c2')
Example #10
Source File: test_trainer.py From python-crfsuite with MIT License | 5 votes |
def test_trainer_select_raises_error():
    """Selecting an unknown algorithm name raises ValueError."""
    fresh_trainer = Trainer()
    with pytest.raises(ValueError):
        fresh_trainer.select('foo')
Example #11
Source File: test_trainer.py From python-crfsuite with MIT License | 5 votes |
def test_get_parameter():
    """get() returns the value most recently stored with set()."""
    sgd_trainer = Trainer()
    sgd_trainer.select('l2sgd')

    # Before set(), c2 is expected to differ from 0.1.
    before = sgd_trainer.get('c2')
    assert abs(before - 0.1) > 1e-6

    sgd_trainer.set('c2', 0.1)
    after = sgd_trainer.get('c2')
    assert abs(after - 0.1) < 1e-6
Example #12
Source File: test_trainer.py From python-crfsuite with MIT License | 5 votes |
def test_set_parameters_in_constructor():
    """Parameters passed via the constructor's params dict are applied."""
    configured = Trainer(params={'c2': 100})
    stored_c2 = configured.get('c2')
    assert abs(stored_c2 - 100) < 1e-6
Example #13
Source File: crf_trainer.py From MicroTokenizer with MIT License | 5 votes |
def __init__(self, feature_func_list=None):
    """Create a quiet CRF trainer and pick the feature functions to use.

    A missing or empty feature_func_list falls back to
    default_feature_func_list.
    """
    self.crf_trainer = pycrfsuite.Trainer(verbose=False)
    self.feature_func_list = feature_func_list or default_feature_func_list
Example #14
Source File: test_trainer.py From python-crfsuite with MIT License | 5 votes |
def test_trainer_noselect_noappend(tmpdir):
    """Training with no algorithm selected and no data must not crash."""
    # This shouldn't segfault; see https://github.com/chokkan/crfsuite/pull/21
    model_path = str(tmpdir.join('model.crfsuite'))
    Trainer().train(model_path)
Example #15
Source File: test_trainer.py From python-crfsuite with MIT License | 5 votes |
def test_trainer_noappend(tmpdir):
    """Training with an algorithm selected but no data must not crash."""
    # This shouldn't segfault; see https://github.com/chokkan/crfsuite/pull/21
    empty_trainer = Trainer()
    empty_trainer.select('lbfgs')

    model_path = str(tmpdir.join('model.crfsuite'))
    empty_trainer.train(model_path)
Example #16
Source File: test_trainer.py From python-crfsuite with MIT License | 5 votes |
def test_trainer(tmpdir, xseq, yseq):
    """train() creates the model file at the requested path."""
    lbfgs_trainer = Trainer('lbfgs')
    lbfgs_trainer.append(xseq, yseq)

    model_path = str(tmpdir.join('model.crfsuite'))
    # The file must not exist until training has run.
    assert not os.path.isfile(model_path)

    lbfgs_trainer.train(model_path)
    assert os.path.isfile(model_path)
Example #17
Source File: CRF.py From indic_tagger with Apache License 2.0 | 5 votes |
def __init__(self, model_path):
    """Create a quiet CRF trainer with fixed training parameters.

    :param model_path: stored on the instance as ``self.model_path``.
    """
    self.trainer = pycrfsuite.Trainer(verbose=False)
    self.model_path = model_path
    self.trainer.set_params({
        'c1': 1.0,  # coefficient for L1 penalty
        # coefficient for L2 penalty; this was written as `1-3`, which is
        # the subtraction -2 rather than the intended 0.001
        'c2': 1e-3,
        'max_iterations': 50,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True,
    })
Example #18
Source File: conftest.py From python-crfsuite with MIT License | 5 votes |
def model_filename(tmpdir, xseq, yseq):
    """Train an lbfgs model on (xseq, yseq) and return the path it was saved to."""
    from pycrfsuite import Trainer

    saved_path = str(tmpdir.join('model.crfsuite'))
    quiet_trainer = Trainer('lbfgs', verbose=False)
    quiet_trainer.append(xseq, yseq)
    quiet_trainer.train(saved_path)
    return saved_path
Example #19
Source File: test_tagger.py From python-crfsuite with MIT License | 5 votes |
def test_append_strstr_dicts(tmpdir):
    """String-valued dict features become 'key:value' attributes in the model."""
    features = [
        {'foo': 'bar'},
        {'baz': False},
        {'foo': 'bar', 'baz': True},
        {'baz': 0.2},
    ]
    labels = ['spam', 'egg', 'spam', 'spam']

    dict_trainer = Trainer()
    dict_trainer.append(features, labels)

    model_path = str(tmpdir.join('model.crfsuite'))
    dict_trainer.train(model_path)

    with Tagger().open(model_path) as tagger:
        model_info = tagger.info()
        assert set(model_info.attributes.keys()) == {'foo:bar', 'baz'}
        assert model_info.state_features[('foo:bar', 'spam')] > 0
Example #20
Source File: entity_extractor_worker.py From texta with GNU General Public License v3.0 | 5 votes |
def _train_and_save(self, X_train, y_train):
    """Feed training sequences to a CRF trainer, train it, and save the model.

    Every 2500 sequences the available system memory is checked; when it
    drops below ``self.min_mb_available_memory`` no further data is added
    and a warning is recorded in ``self.train_summary``.

    Returns the trained ``Trainer``.
    """
    crf_trainer = Trainer(verbose=False)

    for i, (features, labels) in enumerate(zip(X_train, y_train)):
        # Check how much memory left, stop adding more data if too little
        if i % 2500 == 0 and (psutil.virtual_memory().available / 1000000) < self.min_mb_available_memory:
            print('EntityExtractorWorker:_get_memory_safe_features - Less than {} Mb of memory remaining, breaking adding more data.'.format(self.min_mb_available_memory))
            self.train_summary["warning"] = "Trained on {} documents, because more documents don't fit into memory".format(i)
            event_text = 'Less than {}Mb of memory available, stopping adding more training data. Iteration {}.'.format(self.min_mb_available_memory, i)
            log_dict = {
                'task': 'EntityExtractorWorker:_train_and_save',
                'event': event_text,
                'data': {'task_id': self.task_id},
            }
            self.info_logger.info("Memory", extra=log_dict)
            break
        crf_trainer.append(features, labels)

    training_options = {
        'c1': 0.5,  # coefficient for L1 penalty
        'c2': 1e-4,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier
        # transitions that are possible, but not observed
        'feature.possible_transitions': True,
    }
    crf_trainer.set_params(training_options)

    output_model_path = create_file_path(self.model_name, MODELS_DIR, self.task_type)
    # Train and save
    crf_trainer.train(output_model_path)
    return crf_trainer
Example #21
Source File: crf.py From webQA_sequence_labelling_pytorch with MIT License | 5 votes |
def test():
    """Train a tiny CRF on one hand-written sequence and print its predictions."""
    # Dict-style features; immediately replaced by the string-style ones below.
    X_train = [[
        {'foo': 1, 'bar': 0, 's': 0, 'p': 4, 'd': True, 'a': 0.7, 'b': 0.5, 'c': 9},
        {'foo': 0, 'baz': 1, 's': 0, 'p': 0, 'd': False, 'a': 8.7, 'b': 7.5, 'c': 1},
    ]]
    X_train = [[
        ['foo=1', 'bar=0', 'c=9', 's=0', 'sd=12', 'cd=2', 'ca=3', 'd=True', 'cc=89'],
        ['foo=4', 'bar=7', 'c=3', 's=1', 'sd=8', 'cd=9', 'ca=1', 'd=False', 'cc=18'],
    ]]
    y_train = [['0', '1']]

    model_file = 'conll2002-esp.crfsuite'
    crf_trainer = pycrfsuite.Trainer(verbose=False)
    for features, labels in zip(X_train, y_train):
        print('x: ', features)
        print('y: ', labels)
        crf_trainer.append(features, labels)

    crf_trainer.set_params({
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 500,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True,
    })
    crf_trainer.train(model_file)

    tagger = pycrfsuite.Tagger()
    tagger.open(model_file)
    predicted = tagger.tag(X_train[0])
    print("Predicted:", ' '.join(predicted))
    print("Correct: ", ' '.join(y_train[0]))
Example #22
Source File: entity_extractor.py From ai-chatbot-framework with MIT License | 5 votes |
def train(self, train_sentences, model_name):
    """
    Train an NER model and save it under model_files/<model_name>.model.

    :param train_sentences: sentences to train on; each is converted with
        sent_to_features and sent_to_labels.
    :param model_name: name used to build the output model path.
    :return: always True.
    """
    feature_seqs = [self.sent_to_features(sentence) for sentence in train_sentences]
    label_seqs = [self.sent_to_labels(sentence) for sentence in train_sentences]

    crf_trainer = pycrfsuite.Trainer(verbose=False)
    for features, labels in zip(feature_seqs, label_seqs):
        crf_trainer.append(features, labels)

    training_options = {
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True,
    }
    crf_trainer.set_params(training_options)
    crf_trainer.train('model_files/%s.model' % model_name)
    return True

# Extract Labels from BIO tagged sentence
Example #23
Source File: extractor.py From HotPepperGourmetDialogue with MIT License | 5 votes |
def train(self, train_x, train_y, save_file='model.crfsuite'):
    """Train a CRF on paired sequences, save it, and open it in the tagger.

    :param train_x: feature sequences.
    :param train_y: label sequences, paired with train_x.
    :param save_file: path the model is written to and then opened from.
    """
    crf_trainer = pycrfsuite.Trainer(verbose=False)
    for features, labels in zip(train_x, train_y):
        crf_trainer.append(features, labels)

    training_options = {
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier
        'feature.possible_transitions': True,
    }
    crf_trainer.set_params(training_options)
    crf_trainer.train(save_file)

    # Open the freshly trained model in this object's tagger.
    self.__tagger.open(save_file)
Example #24
Source File: tag.py From ChemDataExtractor with MIT License | 5 votes |
def train(self, sentences, model):
    """Train the CRF tagger using CRFSuite, then load the resulting model.

    :param sentences: Annotated sentences, each a sequence of
        (token, label) pairs.
    :param model: Path the trained model is saved to.
    """
    crf_trainer = pycrfsuite.Trainer(verbose=True)
    crf_trainer.set_params(self.params)

    for annotated in sentences:
        words, tags = zip(*annotated)
        feature_seq = []
        for position in range(len(words)):
            feature_seq.append(self._get_features(words, position))
        crf_trainer.append(feature_seq, tags)

    crf_trainer.train(model)
    self.load(model)
Example #25
Source File: crf_pos_tagger.py From Jiayan with MIT License | 5 votes |
def train(self, train_x, train_y, out_model):
    """Train a CRF on the non-empty (x, y) pairs and save it to out_model.

    Prints the trainer's last logged iteration once training finishes.
    """
    crf_trainer = pycrfsuite.Trainer(verbose=False)

    for features, labels in zip(train_x, train_y):
        # Skip pairs where either side is empty.
        if not (features and labels):
            continue
        crf_trainer.append(features, labels)

    training_options = {
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier
        # include transitions that are possible, but not observed
        'feature.possible_transitions': True,
    }
    crf_trainer.set_params(training_options)

    crf_trainer.train(out_model)
    print(crf_trainer.logparser.last_iteration)
Example #26
Source File: crf_sent_tagger.py From Jiayan with MIT License | 5 votes |
def train(self, train_x, train_y, out_model):
    """Fit a CRF on every pair whose features and labels are both non-empty.

    The model is written to out_model and the trainer's last logged
    iteration is printed afterwards.
    """
    suite_trainer = pycrfsuite.Trainer(verbose=False)

    for seq_x, seq_y in zip(train_x, train_y):
        if not seq_x or not seq_y:
            # Nothing usable in this pair.
            continue
        suite_trainer.append(seq_x, seq_y)

    suite_trainer.set_params({
        'c1': 1.0,  # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier
        'feature.possible_transitions': True,  # include transitions that are possible, but not observed
    })

    suite_trainer.train(out_model)
    last_logged = suite_trainer.logparser.last_iteration
    print(last_logged)
Example #27
Source File: pycrfsuite.py From TBBTCorpus with Apache License 2.0 | 5 votes |
def __init__(self, enumerations=100, L1Penalty=1.0, L2Penalty=1e-3):
    """Create a quiet CRF trainer configured from the given settings.

    :param enumerations: passed to the trainer as 'max_iterations'.
    :param L1Penalty: passed to the trainer as 'c1'.
    :param L2Penalty: passed to the trainer as 'c2'.
    """
    training_options = {
        'c1': L1Penalty,
        'c2': L2Penalty,
        'max_iterations': enumerations,
        'feature.possible_transitions': True,
    }
    self.crf_feature_train = crf.Trainer(verbose=False)
    self.crf_feature_train.set_params(training_options)

#Method to append more features to the trainer
#More features include TOKEN and its respective POS
#It also includes the act tag of the sentence
Example #28
Source File: training.py From parserator with MIT License | 5 votes |
def trainModel(training_data, module, model_path,
               params_to_set={'c1':0.1, 'c2':0.01, 'feature.minfreq':0}):
    """Train a CRF from labelled token sequences and write it to model_path.

    Each item of training_data is a pair whose second element is a sequence
    of (token, label) components; features are produced by
    module.tokens2features.
    """
    # NOTE: the default dict is shared between calls; it is only read here.
    crf_trainer = pycrfsuite.Trainer(verbose=False, params=params_to_set)

    for _, labelled_components in training_data:
        # Transpose the (token, label) pairs into two parallel tuples.
        tokens, labels = list(zip(*labelled_components))
        features = module.tokens2features(tokens)
        crf_trainer.append(features, labels)

    crf_trainer.train(model_path)
Example #29
Source File: utils.py From parserator with MIT License | 5 votes |
def fit(self, X, y, model_path, **params):
    """Train a CRF from raw texts and their labels and save it to model_path.

    :param X: raw texts; each is split with ``tokenize``.
    :param y: label sequences, paired with X.
    :param model_path: path the trained model is written to. It is declared
        before ``**params`` because Python does not allow a parameter after
        ``**kwargs``.
    :param params: accepted but not used; the training parameters are
        rebuilt from the instance attributes below.
    """
    # sklearn requires parameters to be declared as fields of the estimator,
    # and we can't have a full stop there, so the fields use underscores.
    # Map the underscores back to full stops for the trainer.
    params = {k.replace('_', '.'): v for k, v in self.__dict__.items()}
    trainer = pycrfsuite.Trainer(verbose=False, params=params)

    for raw_text, labels in zip(X, y):
        tokens = tokenize(raw_text)
        trainer.append(tokens2features(tokens), labels)

    trainer.train(model_path)
    reload(parserator)
Example #30
Source File: test_tagger.py From python-crfsuite with MIT License | 4 votes |
def test_append_nested_dicts(tmpdir):
    """Nested dict/list/set features are flattened into colon-joined attributes."""
    first_item = {
        "foo": {
            "bar": "baz",
            "spam": 0.5,
            "egg": ["x", "y"],
            "ham": {"x": -0.5, "y": -0.1},
        },
    }
    second_item = {
        "foo": {
            "bar": "ham",
            "spam": -0.5,
            "ham": {"x", "y"},
        },
    }

    nested_trainer = Trainer()
    nested_trainer.append([first_item, second_item], ['first', 'second'])

    model_path = str(tmpdir.join('model.crfsuite'))
    nested_trainer.train(model_path)

    # Attributes expected to favour each of the two labels.
    first_feats = ['foo:bar:baz', 'foo:spam', 'foo:egg:x', 'foo:egg:y']
    second_feats = ['foo:bar:ham', 'foo:ham:x', 'foo:ham:y']

    with Tagger().open(model_path) as tagger:
        model_info = tagger.info()
        weights = model_info.state_features

        assert set(model_info.attributes.keys()) == set(first_feats) | set(second_feats)

        for name in first_feats:
            assert weights[(name, 'first')] > 0
            assert weights.get((name, 'second'), 0) <= 0

        for name in second_feats:
            assert weights[(name, 'second')] > 0
            assert weights.get((name, 'first'), 0) <= 0