Python nltk.classify.NaiveBayesClassifier.train() Examples
The following are 30 code examples of nltk.classify.NaiveBayesClassifier.train().
Each example comes from an open-source project; the source file, project, and license are noted in the header above each snippet.
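For orientation, NaiveBayesClassifier.train() takes a list of (featureset, label) pairs, where each featureset is a dict mapping feature names to feature values, and returns a trained classifier whose classify() method accepts a featureset of the same shape. A minimal sketch with made-up toy features (the feature names and data below are purely illustrative, not taken from the examples that follow):

from nltk.classify import NaiveBayesClassifier

# Toy training data: each item is a (featureset, label) pair.
train_data = [
    ({'contains(great)': True, 'contains(boring)': False}, 'pos'),
    ({'contains(great)': False, 'contains(boring)': True}, 'neg'),
]

classifier = NaiveBayesClassifier.train(train_data)
print(classifier.classify({'contains(great)': True, 'contains(boring)': False}))  # 'pos'
classifier.show_most_informative_features(2)

The examples below show the same call pattern embedded in larger pipelines: sentiment analysis, dependency parsing, and summarization.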
Example #1
Source File: util.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def split_train_test(all_instances, n=None):
    """
    Randomly split `n` instances of the dataset into train and test sets.

    :param all_instances: a list of instances (e.g. documents) that will be split.
    :param n: the number of instances to consider (in case we want to use only a subset).
    :return: two lists of instances. Train set is 8/10 of the total and test set
        is 2/10 of the total.
    """
    random.seed(12345)
    random.shuffle(all_instances)
    if not n or n > len(all_instances):
        n = len(all_instances)
    train_set = all_instances[:int(.8*n)]
    test_set = all_instances[int(.8*n):n]

    return train_set, test_set
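In the sentiment demos further down (Examples #29 and #30), this helper is called once per class so that the label distribution stays balanced in both splits. A small usage sketch with hypothetical labeled documents (the data below is made up, not from those demos):

# Hypothetical labeled documents; in the demos these come from nltk.corpus.
pos_docs = [(['good', 'movie'], 'pos'), (['great', 'plot'], 'pos')]
neg_docs = [(['bad', 'movie'], 'neg'), (['dull', 'plot'], 'neg')]

train_pos, test_pos = split_train_test(pos_docs)
train_neg, test_neg = split_train_test(neg_docs)
training_docs = train_pos + train_neg
testing_docs = test_pos + test_neg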
Example #2
Source File: util.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def demo_sent_subjectivity(text):
    """
    Classify a single sentence as subjective or objective using a stored
    SentimentAnalyzer.

    :param text: a sentence whose subjectivity has to be classified.
    """
    from nltk.classify import NaiveBayesClassifier
    from nltk.tokenize import regexp

    word_tokenizer = regexp.WhitespaceTokenizer()
    try:
        sentim_analyzer = load('sa_subjectivity.pickle')
    except LookupError:
        print('Cannot find the sentiment analyzer you want to load.')
        print('Training a new one using NaiveBayesClassifier.')
        sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)

    # Tokenize and convert to lower case
    tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)]
    print(sentim_analyzer.classify(tokenized_text))
Example #3
Source File: util.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def demo_sent_subjectivity(text):
    """
    Classify a single sentence as subjective or objective using a stored
    SentimentAnalyzer.

    :param text: a sentence whose subjectivity has to be classified.
    """
    from nltk.classify import NaiveBayesClassifier
    from nltk.tokenize import regexp

    word_tokenizer = regexp.WhitespaceTokenizer()
    try:
        sentim_analyzer = load('sa_subjectivity.pickle')
    except LookupError:
        print('Cannot find the sentiment analyzer you want to load.')
        print('Training a new one using NaiveBayesClassifier.')
        sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)

    # Tokenize and convert to lower case
    tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)]
    print(sentim_analyzer.classify(tokenized_text))
Example #4
Source File: util.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def split_train_test(all_instances, n=None):
    """
    Randomly split `n` instances of the dataset into train and test sets.

    :param all_instances: a list of instances (e.g. documents) that will be split.
    :param n: the number of instances to consider (in case we want to use only a subset).
    :return: two lists of instances. Train set is 8/10 of the total and test set
        is 2/10 of the total.
    """
    random.seed(12345)
    random.shuffle(all_instances)
    if not n or n > len(all_instances):
        n = len(all_instances)
    train_set = all_instances[: int(0.8 * n)]
    test_set = all_instances[int(0.8 * n) : n]

    return train_set, test_set
Example #5
Source File: sequential.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def __init__(
    self,
    train=None,
    model=None,
    affix_length=-3,
    min_stem_length=2,
    backoff=None,
    cutoff=0,
    verbose=False,
):
    self._check_params(train, model)

    ContextTagger.__init__(self, model, backoff)

    self._affix_length = affix_length
    self._min_word_length = min_stem_length + abs(affix_length)

    if train:
        self._train(train, cutoff, verbose)
Example #6
Source File: sequential.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
    NgramTagger.__init__(self, 1, train, model, backoff, cutoff, verbose)
Example #7
Source File: nonprojectivedependencyparser.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def nonprojective_conll_parse_demo():
    from nltk.parse.dependencygraph import conll_data2

    graphs = [DependencyGraph(entry) for entry in conll_data2.split('\n\n') if entry]
    npp = ProbabilisticNonprojectiveParser()
    npp.train(graphs, NaiveBayesDependencyScorer())
    for parse_graph in npp.parse(
        ['Cathy', 'zag', 'hen', 'wild', 'zwaaien', '.'],
        ['N', 'V', 'Pron', 'Adj', 'N', 'Punc'],
    ):
        print(parse_graph)
Example #8
Source File: nonprojectivedependencyparser.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def hall_demo():
    npp = ProbabilisticNonprojectiveParser()
    npp.train([], DemoScorer())
    for parse_graph in npp.parse(['v1', 'v2', 'v3'], [None, None, None]):
        print(parse_graph)
Example #9
Source File: nonprojectivedependencyparser.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def train(self, graphs):
    print('Training...')
Example #10
Source File: nonprojectivedependencyparser.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def train(self, graphs): """ Trains a ``NaiveBayesClassifier`` using the edges present in graphs list as positive examples, the edges not present as negative examples. Uses a feature vector of head-word, head-tag, child-word, and child-tag. :type graphs: list(DependencyGraph) :param graphs: A list of dependency graphs to train the scorer. """ from nltk.classify import NaiveBayesClassifier # Create training labeled training examples labeled_examples = [] for graph in graphs: for head_node in graph.nodes.values(): for child_index, child_node in graph.nodes.items(): if child_index in head_node['deps']: label = "T" else: label = "F" labeled_examples.append( ( dict( a=head_node['word'], b=head_node['tag'], c=child_node['word'], d=child_node['tag'], ), label, ) ) self.classifier = NaiveBayesClassifier.train(labeled_examples)
Example #11
Source File: nonprojectivedependencyparser.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def train(self, graphs): """ :type graphs: list(DependencyGraph) :param graphs: A list of dependency graphs to train the scorer. Typically the edges present in the graphs can be used as positive training examples, and the edges not present as negative examples. """ raise NotImplementedError()
Example #12
Source File: sequential.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
    NgramTagger.__init__(self, 3, train, model, backoff, cutoff, verbose)
Example #13
Source File: sequential.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
    NgramTagger.__init__(self, 2, train, model, backoff, cutoff, verbose)
Example #14
Source File: sequential.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
    NgramTagger.__init__(self, 1, train, model, backoff, cutoff, verbose)
Example #15
Source File: sequential.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def __init__(
    self, n, train=None, model=None, backoff=None, cutoff=0, verbose=False
):
    self._n = n
    self._check_params(train, model)

    ContextTagger.__init__(self, model, backoff)

    if train:
        self._train(train, cutoff, verbose)
Example #16
Source File: supervised.py From indosum with Apache License 2.0 | 5 votes |
def train(cls, tagged_vecs: Collection[Tuple[np.ndarray, int]]) -> '_GaussianEmission':
    by_tag: dict = defaultdict(list)
    for vec, tag in tagged_vecs:
        by_tag[tag].append(vec)

    mean_dict = {}
    matrices = []
    for tag, vecs in by_tag.items():
        mean = mean_dict[tag] = np.mean(vecs, axis=0)
        for vec in vecs:
            v = (vec - mean).reshape(-1, 1)
            matrices.append(v.dot(v.T))
    cov = np.mean(matrices, axis=0)
    return cls(mean_dict, cov)
Example #17
Source File: supervised.py From indosum with Apache License 2.0 | 5 votes |
def train(cls,
          docs: Collection[Document],
          cutoff: float = 0.1,
          idf_table: Optional[Mapping[Word, float]] = None,
          ) -> 'NaiveBayesSummarizer':
    """Train the model on a collection of documents.

    Args:
        docs (Collection[Document]): The collection of documents to train on.
        cutoff (float): Cutoff for signature words.
        idf_table (Mapping[Word, float]): Precomputed IDF table. If not given,
            the IDF will be computed from ``docs``.

    Returns:
        NaiveBayes: The trained model.
    """
    # Find signature words
    idf = cls._compute_idf(docs) if idf_table is None else idf_table
    n_cutoff = int(cutoff * len(idf))
    signature_words = set(sorted(
        idf.keys(), key=lambda w: idf[w], reverse=True)[:n_cutoff])

    train_data = []  # type: list
    for doc in docs:
        featuresets = cls._extract_featuresets(doc, signature_words)
        labels = [sent.label for sent in doc.sentences]
        train_data.extend(zip(featuresets, labels))
    return cls(
        NaiveBayesClassifier.train(train_data),
        signature_words=signature_words)
Example #18
Source File: nonprojectivedependencyparser.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def nonprojective_conll_parse_demo():
    from nltk.parse.dependencygraph import conll_data2

    graphs = [
        DependencyGraph(entry) for entry in conll_data2.split('\n\n') if entry
    ]
    npp = ProbabilisticNonprojectiveParser()
    npp.train(graphs, NaiveBayesDependencyScorer())
    for parse_graph in npp.parse(['Cathy', 'zag', 'hen', 'wild', 'zwaaien', '.'],
                                 ['N', 'V', 'Pron', 'Adj', 'N', 'Punc']):
        print(parse_graph)
Example #19
Source File: nonprojectivedependencyparser.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def hall_demo():
    npp = ProbabilisticNonprojectiveParser()
    npp.train([], DemoScorer())
    for parse_graph in npp.parse(['v1', 'v2', 'v3'], [None, None, None]):
        print(parse_graph)
Example #20
Source File: nonprojectivedependencyparser.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def train(self, graphs, dependency_scorer):
    """
    Trains a ``DependencyScorerI`` from a set of ``DependencyGraph`` objects,
    and establishes this as the parser's scorer.  This is used to
    initialize the scores on a ``DependencyGraph`` during the parsing
    procedure.

    :type graphs: list(DependencyGraph)
    :param graphs: A list of dependency graphs to train the scorer.
    :type dependency_scorer: DependencyScorerI
    :param dependency_scorer: A scorer which implements the
        ``DependencyScorerI`` interface.
    """
    self._scorer = dependency_scorer
    self._scorer.train(graphs)
Example #21
Source File: nonprojectivedependencyparser.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def train(self, graphs):
    print('Training...')
Example #22
Source File: nonprojectivedependencyparser.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def train(self, graphs): """ :type graphs: list(DependencyGraph) :param graphs: A list of dependency graphs to train the scorer. Typically the edges present in the graphs can be used as positive training examples, and the edges not present as negative examples. """ raise NotImplementedError()
Example #23
Source File: sequential.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def __init__(self, train=None, model=None, affix_length=-3,
             min_stem_length=2, backoff=None, cutoff=0, verbose=False):
    self._check_params(train, model)

    ContextTagger.__init__(self, model, backoff)

    self._affix_length = affix_length
    self._min_word_length = min_stem_length + abs(affix_length)

    if train:
        self._train(train, cutoff, verbose)
Example #24
Source File: sequential.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
    NgramTagger.__init__(self, 3, train, model, backoff, cutoff, verbose)
Example #25
Source File: sequential.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
    NgramTagger.__init__(self, 2, train, model, backoff, cutoff, verbose)
Example #26
Source File: sequential.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def __init__(self, n, train=None, model=None, backoff=None,
             cutoff=0, verbose=False):
    self._n = n
    self._check_params(train, model)

    ContextTagger.__init__(self, model, backoff)

    if train:
        self._train(train, cutoff, verbose)
Example #27
Source File: supervised.py From indosum with Apache License 2.0 | 4 votes |
def train(cls,
          docs: Collection[Document],
          stopwords: Optional[Collection[Word]] = None,
          algorithm: str = 'iis',
          cutoff: int = 4,
          sigma: float = 0.,
          trim_length: int = 10,
          ) -> 'MaxentSummarizer':
    """Train the model on a collection of documents.

    Args:
        docs (Collection[Document]): The collection of documents to train on.
        stopwords (Collection[Word]): Collection of stopwords.
        algorithm (str): Optimization algorithm for training. Possible values are
            'iis', 'gis', or 'megam' (requires `megam`_ to be installed).
        cutoff (int): Features that occur fewer than this value in the training
            data will be discarded.
        sigma (float): Standard deviation for the Gaussian prior. Default is no prior.
        trim_length (int): Trim words to this length.

    Returns:
        MaxEntropy: The trained model.

    .. _megam: https://www.umiacs.umd.edu/~hal/megam/
    """
    if stopwords is None:
        stopwords = set()

    word_pairs = {pair for doc in docs for sent in doc.sentences
                  for pair in cls._get_word_pairs(sent, stopwords, trim_len=trim_length)}
    train_data: list = []
    for doc in docs:
        featuresets = cls._extract_featuresets(doc, stopwords, word_pairs, trim_length)
        labels = [sent.label for sent in doc.sentences]
        train_data.extend(zip(featuresets, labels))

    encoding = BinaryMaxentFeatureEncoding.train(
        train_data, count_cutoff=cutoff, alwayson_features=True)
    classifier = MaxentClassifier.train(
        train_data, algorithm=algorithm, encoding=encoding,
        gaussian_prior_sigma=sigma)
    return cls(classifier, stopwords=stopwords, word_pairs=word_pairs)
Example #28
Source File: supervised.py From indosum with Apache License 2.0 | 4 votes |
def train(cls,
          docs: Collection[Document],
          gamma_word: float = 0.1,
          gamma_init: float = 0.1,
          gamma_trans: float = 0.1,
          tf_table: Optional[Mapping[Word, float]] = None,
          ) -> 'HMMSummarizer':
    """Train the model on a collection of documents.

    Args:
        docs (Collection[Document]): The collection of documents to train on.
        gamma_word (float): Smoothing value for the "word probability in a
            document" feature.
        gamma_init (float): Smoothing value for the initial probability.
        gamma_trans (float): Smoothing value for the transition probability.
        tf_table (Mapping[Word, float]): A precomputed term-frequency table
            that is already normalized.

    Returns:
        HMM: The trained model.
    """
    init_fdist = FreqDist()
    trans_fdist = ConditionalFreqDist()
    tagged_vecs: list = []
    states = set()

    for doc in docs:
        tags = cls._get_tags(doc.sentences)
        if not tags:
            continue
        init_fdist[tags[0]] += 1
        for prev, tag in zip(tags, tags[1:]):
            trans_fdist[prev][tag] += 1
        vecs = cls._get_feature_vectors(doc, gamma_word, tf=tf_table)
        tagged_vecs.extend(zip(vecs, tags))
        states.update(tags)

    # Initial probability
    init_pdist = LidstoneProbDist(init_fdist, gamma_init, bins=len(states))
    # Transition probability
    trans_pdist = ConditionalProbDist(
        trans_fdist, LidstoneProbDist, gamma_trans, bins=len(states))
    # Emission probability
    emit_pdist = _GaussianEmission.train(tagged_vecs)

    return cls(
        init_pdist, trans_pdist, emit_pdist, list(states),
        gamma=gamma_word, tf_table=tf_table)
Example #29
Source File: util.py From razzy-spinner with GNU General Public License v3.0 | 4 votes |
def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None):
    """
    Train and test a classifier on instances of the Subjective Dataset by Pang and
    Lee. The dataset is made of 5000 subjective and 5000 objective sentences.
    All tokens (words and punctuation marks) are separated by a whitespace, so
    we use the basic WhitespaceTokenizer to parse the data.

    :param trainer: `train` method of a classifier.
    :param save_analyzer: if `True`, store the SentimentAnalyzer in a pickle file.
    :param n_instances: the number of total sentences that have to be used for
        training and testing. Sentences will be equally split between positive
        and negative.
    :param output: the output file where results have to be reported.
    """
    from sentiment_analyzer import SentimentAnalyzer
    from nltk.corpus import subjectivity

    if n_instances is not None:
        n_instances = int(n_instances/2)

    subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
    obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]

    # We separately split subjective and objective instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_subj_docs, test_subj_docs = split_train_test(subj_docs)
    train_obj_docs, test_obj_docs = split_train_test(obj_docs)

    training_docs = train_subj_docs+train_obj_docs
    testing_docs = test_subj_docs+test_obj_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

    # Add simple unigram word features handling negation
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if save_analyzer == True:
        save_file(sentim_analyzer, 'sa_subjectivity.pickle')

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='subjectivity', Classifier=type(classifier).__name__,
                        Tokenizer='WhitespaceTokenizer', Feats=extr, Instances=n_instances,
                        Results=results)

    return sentim_analyzer
Example #30
Source File: util.py From razzy-spinner with GNU General Public License v3.0 | 4 votes |
def demo_movie_reviews(trainer, n_instances=None, output=None):
    """
    Train classifier on all instances of the Movie Reviews dataset.
    The corpus has been preprocessed using the default sentence tokenizer and
    WordPunctTokenizer.
    Features are composed of:
        - most frequent unigrams

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total reviews that have to be used for
        training and testing. Reviews will be equally split between positive
        and negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.corpus import movie_reviews
    from sentiment_analyzer import SentimentAnalyzer

    if n_instances is not None:
        n_instances = int(n_instances/2)

    pos_docs = [(list(movie_reviews.words(pos_id)), 'pos')
                for pos_id in movie_reviews.fileids('pos')[:n_instances]]
    neg_docs = [(list(movie_reviews.words(neg_id)), 'neg')
                for neg_id in movie_reviews.fileids('neg')[:n_instances]]

    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_docs = train_pos_docs+train_neg_docs
    testing_docs = test_pos_docs+test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words = sentim_analyzer.all_words(training_docs)

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='Movie_reviews', Classifier=type(classifier).__name__,
                        Tokenizer='WordPunctTokenizer', Feats=extr, Results=results,
                        Instances=n_instances)
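To tie the two demos above back to the topic of this page: each takes a classifier's train method as its trainer argument, which is where nltk.classify.NaiveBayesClassifier.train() plugs in. A usage sketch, assuming the demo functions are importable from the module they are defined in (the n_instances value below is arbitrary):

from nltk.classify import NaiveBayesClassifier

# Train and evaluate a movie-review sentiment classifier with Naive Bayes.
demo_movie_reviews(NaiveBayesClassifier.train, n_instances=200)

# Same pattern for the subjectivity demo; optionally pickle the analyzer
# so demo_sent_subjectivity() can reload it later.
demo_subjectivity(NaiveBayesClassifier.train, save_analyzer=True)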