Python nltk.classify.NaiveBayesClassifier.train() Examples

The following are 30 code examples of nltk.classify.NaiveBayesClassifier.train(), collected from open-source projects. Each example is preceded by a line naming its original project, source file, and license. You may also want to check out the other available functions and classes of the nltk.classify module.
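Before the project examples, here is a minimal sketch of the call they all build on. The feature names, labels, and training data below are made up for illustration only.

from nltk.classify import NaiveBayesClassifier

# Each training instance is a (featureset dict, label) pair.
train_data = [
    ({'contains(great)': True, 'contains(boring)': False}, 'pos'),
    ({'contains(great)': False, 'contains(boring)': True}, 'neg'),
]
classifier = NaiveBayesClassifier.train(train_data)
print(classifier.classify({'contains(great)': True, 'contains(boring)': False}))  # 'pos'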
Example #1
Source File: util.py    From razzy-spinner with GNU General Public License v3.0 (6 votes)
def split_train_test(all_instances, n=None):
    """
    Randomly split `n` instances of the dataset into train and test sets.

    :param all_instances: a list of instances (e.g. documents) that will be split.
    :param n: the number of instances to consider (in case we want to use only a
        subset).
    :return: two lists of instances. Train set is 8/10 of the total and test set
        is 2/10 of the total.
    """
    random.seed(12345)
    random.shuffle(all_instances)
    if not n or n > len(all_instances):
        n = len(all_instances)
    train_set = all_instances[:int(.8*n)]
    test_set = all_instances[int(.8*n):n]

    return train_set, test_set 
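A small usage sketch of the function above; the document list is hypothetical, and the snippet relies on the module-level import of random that the source file provides.

import random  # split_train_test() above uses random.seed() and random.shuffle()

docs = [('doc%d' % i, 'label') for i in range(10)]  # hypothetical instances
train_set, test_set = split_train_test(docs)
print(len(train_set), len(test_set))  # 8 2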
Example #2
Source File: util.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (6 votes)
def demo_sent_subjectivity(text):
    """
    Classify a single sentence as subjective or objective using a stored
    SentimentAnalyzer.

    :param text: a sentence whose subjectivity has to be classified.
    """
    from nltk.classify import NaiveBayesClassifier
    from nltk.tokenize import regexp

    word_tokenizer = regexp.WhitespaceTokenizer()
    try:
        sentim_analyzer = load('sa_subjectivity.pickle')
    except LookupError:
        print('Cannot find the sentiment analyzer you want to load.')
        print('Training a new one using NaiveBayesClassifier.')
        sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)

    # Tokenize and convert to lower case
    tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)]
    print(sentim_analyzer.classify(tokenized_text)) 
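A hypothetical call to the function above; on a first run, when sa_subjectivity.pickle cannot be loaded, it falls back to training a new NaiveBayesClassifier before classifying the sentence (the input sentence is made up).

demo_sent_subjectivity("The acting felt heartfelt and deeply personal.")  # prints 'subj' or 'obj'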
Example #3
Source File: util.py    From razzy-spinner with GNU General Public License v3.0 (6 votes)
def demo_sent_subjectivity(text):
    """
    Classify a single sentence as subjective or objective using a stored
    SentimentAnalyzer.

    :param text: a sentence whose subjectivity has to be classified.
    """
    from nltk.classify import NaiveBayesClassifier
    from nltk.tokenize import regexp
    word_tokenizer = regexp.WhitespaceTokenizer()
    try:
        sentim_analyzer = load('sa_subjectivity.pickle')
    except LookupError:
        print('Cannot find the sentiment analyzer you want to load.')
        print('Training a new one using NaiveBayesClassifier.')
        sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)

    # Tokenize and convert to lower case
    tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)]
    print(sentim_analyzer.classify(tokenized_text)) 
Example #4
Source File: util.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (6 votes)
def split_train_test(all_instances, n=None):
    """
    Randomly split `n` instances of the dataset into train and test sets.

    :param all_instances: a list of instances (e.g. documents) that will be split.
    :param n: the number of instances to consider (in case we want to use only a
        subset).
    :return: two lists of instances. Train set is 8/10 of the total and test set
        is 2/10 of the total.
    """
    random.seed(12345)
    random.shuffle(all_instances)
    if not n or n > len(all_instances):
        n = len(all_instances)
    train_set = all_instances[: int(0.8 * n)]
    test_set = all_instances[int(0.8 * n) : n]

    return train_set, test_set 
Example #5
Source File: sequential.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (6 votes)
def __init__(
        self,
        train=None,
        model=None,
        affix_length=-3,
        min_stem_length=2,
        backoff=None,
        cutoff=0,
        verbose=False,
    ):

        self._check_params(train, model)

        ContextTagger.__init__(self, model, backoff)

        self._affix_length = affix_length
        self._min_word_length = min_stem_length + abs(affix_length)

        if train:
            self._train(train, cutoff, verbose) 
Example #6
Source File: sequential.py    From razzy-spinner with GNU General Public License v3.0 (5 votes)
def __init__(self, train=None, model=None,
                 backoff=None, cutoff=0, verbose=False):
        NgramTagger.__init__(self, 1, train, model,
                             backoff, cutoff, verbose) 
Example #7
Source File: nonprojectivedependencyparser.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (5 votes)
def nonprojective_conll_parse_demo():
    from nltk.parse.dependencygraph import conll_data2

    graphs = [DependencyGraph(entry) for entry in conll_data2.split('\n\n') if entry]
    npp = ProbabilisticNonprojectiveParser()
    npp.train(graphs, NaiveBayesDependencyScorer())
    for parse_graph in npp.parse(
        ['Cathy', 'zag', 'hen', 'zwaaien', '.'], ['N', 'V', 'Pron', 'Adj', 'N', 'Punc']
    ):
        print(parse_graph) 
Example #8
Source File: nonprojectivedependencyparser.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (5 votes)
def hall_demo():
    npp = ProbabilisticNonprojectiveParser()
    npp.train([], DemoScorer())
    for parse_graph in npp.parse(['v1', 'v2', 'v3'], [None, None, None]):
        print(parse_graph) 
Example #9
Source File: nonprojectivedependencyparser.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (5 votes)
def train(self, graphs):
        print('Training...') 
Example #10
Source File: nonprojectivedependencyparser.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (5 votes)
def train(self, graphs):
        """
        Trains a ``NaiveBayesClassifier`` using the edges present in
        graphs list as positive examples, the edges not present as
        negative examples.  Uses a feature vector of head-word,
        head-tag, child-word, and child-tag.

        :type graphs: list(DependencyGraph)
        :param graphs: A list of dependency graphs to train the scorer.
        """

        from nltk.classify import NaiveBayesClassifier

        # Create labeled training examples
        labeled_examples = []
        for graph in graphs:
            for head_node in graph.nodes.values():
                for child_index, child_node in graph.nodes.items():
                    if child_index in head_node['deps']:
                        label = "T"
                    else:
                        label = "F"
                    labeled_examples.append(
                        (
                            dict(
                                a=head_node['word'],
                                b=head_node['tag'],
                                c=child_node['word'],
                                d=child_node['tag'],
                            ),
                            label,
                        )
                    )

        self.classifier = NaiveBayesClassifier.train(labeled_examples) 
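For intuition, one entry of labeled_examples built by the loop above might look like the following (values are hypothetical, taken from the Dutch demo sentence in which 'zag' heads 'Cathy'):

({'a': 'zag', 'b': 'V', 'c': 'Cathy', 'd': 'N'}, 'T')  # head word/tag, child word/tag, edge present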
Example #11
Source File: nonprojectivedependencyparser.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (5 votes)
def train(self, graphs):
        """
        :type graphs: list(DependencyGraph)
        :param graphs: A list of dependency graphs to train the scorer.
        Typically the edges present in the graphs can be used as
        positive training examples, and the edges not present as negative
        examples.
        """
        raise NotImplementedError() 
Example #12
Source File: sequential.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (5 votes)
def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
        NgramTagger.__init__(self, 3, train, model, backoff, cutoff, verbose) 
Example #13
Source File: sequential.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (5 votes)
def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
        NgramTagger.__init__(self, 2, train, model, backoff, cutoff, verbose) 
Example #14
Source File: sequential.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (5 votes)
def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False):
        NgramTagger.__init__(self, 1, train, model, backoff, cutoff, verbose) 
Example #15
Source File: sequential.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International (5 votes)
def __init__(
        self, n, train=None, model=None, backoff=None, cutoff=0, verbose=False
    ):
        self._n = n
        self._check_params(train, model)

        ContextTagger.__init__(self, model, backoff)

        if train:
            self._train(train, cutoff, verbose) 
Example #16
Source File: supervised.py    From indosum with Apache License 2.0 (5 votes)
def train(cls, tagged_vecs: Collection[Tuple[np.ndarray, int]]) -> '_GaussianEmission':
        by_tag: dict = defaultdict(list)
        for vec, tag in tagged_vecs:
            by_tag[tag].append(vec)

        mean_dict = {}
        matrices = []
        for tag, vecs in by_tag.items():
            mean = mean_dict[tag] = np.mean(vecs, axis=0)
            for vec in vecs:
                v = (vec - mean).reshape(-1, 1)
                matrices.append(v.dot(v.T))
        cov = np.mean(matrices, axis=0)
        return cls(mean_dict, cov) 
Example #17
Source File: supervised.py    From indosum with Apache License 2.0 (5 votes)
def train(cls,
              docs: Collection[Document],
              cutoff: float = 0.1,
              idf_table: Optional[Mapping[Word, float]] = None,
              ) -> 'NaiveBayesSummarizer':
        """Train the model on a collection of documents.

        Args:
            docs (Collection[Document]): The collection of documents to train on.
            cutoff (float): Cutoff for signature words.
            idf_table (Mapping[Word, float]): Precomputed IDF table. If not given, the IDF
                will be computed from ``docs``.

        Returns:
            NaiveBayes: The trained model.
        """
        # Find signature words
        idf = cls._compute_idf(docs) if idf_table is None else idf_table
        n_cutoff = int(cutoff * len(idf))
        signature_words = set(sorted(
            idf.keys(), key=lambda w: idf[w], reverse=True)[:n_cutoff])

        train_data = []  # type: list
        for doc in docs:
            featuresets = cls._extract_featuresets(doc, signature_words)
            labels = [sent.label for sent in doc.sentences]
            train_data.extend(zip(featuresets, labels))
        return cls(
            NaiveBayesClassifier.train(train_data), signature_words=signature_words) 
Example #18
Source File: nonprojectivedependencyparser.py    From razzy-spinner with GNU General Public License v3.0 (5 votes)
def nonprojective_conll_parse_demo():
    from nltk.parse.dependencygraph import conll_data2

    graphs = [
        DependencyGraph(entry) for entry in conll_data2.split('\n\n') if entry
    ]
    npp = ProbabilisticNonprojectiveParser()
    npp.train(graphs, NaiveBayesDependencyScorer())
    for parse_graph in npp.parse(['Cathy', 'zag', 'hen', 'zwaaien', '.'], ['N', 'V', 'Pron', 'Adj', 'N', 'Punc']):
        print(parse_graph) 
Example #19
Source File: nonprojectivedependencyparser.py    From razzy-spinner with GNU General Public License v3.0 (5 votes)
def hall_demo():
    npp = ProbabilisticNonprojectiveParser()
    npp.train([], DemoScorer())
    for parse_graph in npp.parse(['v1', 'v2', 'v3'], [None, None, None]):
        print(parse_graph) 
Example #20
Source File: nonprojectivedependencyparser.py    From razzy-spinner with GNU General Public License v3.0 (5 votes)
def train(self, graphs, dependency_scorer):
        """
        Trains a ``DependencyScorerI`` from a set of ``DependencyGraph`` objects,
        and establishes this as the parser's scorer.  This is used to
        initialize the scores on a ``DependencyGraph`` during the parsing
        procedure.

        :type graphs: list(DependencyGraph)
        :param graphs: A list of dependency graphs to train the scorer.
        :type dependency_scorer: DependencyScorerI
        :param dependency_scorer: A scorer which implements the
            ``DependencyScorerI`` interface.
        """
        self._scorer = dependency_scorer
        self._scorer.train(graphs) 
Example #21
Source File: nonprojectivedependencyparser.py    From razzy-spinner with GNU General Public License v3.0 (5 votes)
def train(self, graphs):
        print('Training...') 
Example #22
Source File: nonprojectivedependencyparser.py    From razzy-spinner with GNU General Public License v3.0 (5 votes)
def train(self, graphs):
        """
        :type graphs: list(DependencyGraph)
        :param graphs: A list of dependency graphs to train the scorer.
        Typically the edges present in the graphs can be used as
        positive training examples, and the edges not present as negative
        examples.
        """
        raise NotImplementedError() 
Example #23
Source File: sequential.py    From razzy-spinner with GNU General Public License v3.0 (5 votes)
def __init__(self, train=None, model=None, affix_length=-3,
                 min_stem_length=2, backoff=None, cutoff=0, verbose=False):

        self._check_params(train, model)

        ContextTagger.__init__(self, model, backoff)

        self._affix_length = affix_length
        self._min_word_length = min_stem_length + abs(affix_length)

        if train:
            self._train(train, cutoff, verbose) 
Example #24
Source File: sequential.py    From razzy-spinner with GNU General Public License v3.0 (5 votes)
def __init__(self, train=None, model=None,
                 backoff=None, cutoff=0, verbose=False):
        NgramTagger.__init__(self, 3, train, model,
                             backoff, cutoff, verbose) 
Example #25
Source File: sequential.py    From razzy-spinner with GNU General Public License v3.0 (5 votes)
def __init__(self, train=None, model=None,
                 backoff=None, cutoff=0, verbose=False):
        NgramTagger.__init__(self, 2, train, model,
                             backoff, cutoff, verbose) 
Example #26
Source File: sequential.py    From razzy-spinner with GNU General Public License v3.0 (5 votes)
def __init__(self, n, train=None, model=None,
                 backoff=None, cutoff=0, verbose=False):
        self._n = n
        self._check_params(train, model)

        ContextTagger.__init__(self, model, backoff)

        if train:
            self._train(train, cutoff, verbose) 
Example #27
Source File: supervised.py    From indosum with Apache License 2.0 (4 votes)
def train(cls,
              docs: Collection[Document],
              stopwords: Optional[Collection[Word]] = None,
              algorithm: str = 'iis',
              cutoff: int = 4,
              sigma: float = 0.,
              trim_length: int = 10,
              ) -> 'MaxentSummarizer':
        """Train the model on a collection of documents.

        Args:
            docs (Collection[Document]): The collection of documents to train on.
            stopwords (Collection[Word]): Collection of stopwords.
            algorithm (str): Optimization algorithm for training. Possible values are 'iis',
                'gis', or 'megam' (requires `megam`_ to be installed).
            cutoff (int): Features that occur fewer than this value in the training data will
                be discarded.
            sigma (float): Standard deviation for the Gaussian prior. Default is no prior.
            trim_length (int): Trim words to this length.

        Returns:
            MaxEntropy: The trained model.

        .. _megam: https://www.umiacs.umd.edu/~hal/megam/
        """
        if stopwords is None:
            stopwords = set()

        word_pairs = {pair for doc in docs for sent in doc.sentences
                      for pair in cls._get_word_pairs(sent, stopwords, trim_len=trim_length)}

        train_data: list = []
        for doc in docs:
            featuresets = cls._extract_featuresets(doc, stopwords, word_pairs, trim_length)
            labels = [sent.label for sent in doc.sentences]
            train_data.extend(zip(featuresets, labels))

        encoding = BinaryMaxentFeatureEncoding.train(
            train_data, count_cutoff=cutoff, alwayson_features=True)
        classifier = MaxentClassifier.train(
            train_data, algorithm=algorithm, encoding=encoding, gaussian_prior_sigma=sigma)
        return cls(classifier, stopwords=stopwords, word_pairs=word_pairs) 
Example #28
Source File: supervised.py    From indosum with Apache License 2.0 (4 votes)
def train(cls,
              docs: Collection[Document],
              gamma_word: float = 0.1,
              gamma_init: float = 0.1,
              gamma_trans: float = 0.1,
              tf_table: Optional[Mapping[Word, float]] = None,
              ) -> 'HMMSummarizer':
        """Train the model on a collection of documents.

        Args:
            docs (Collection[Document]): The collection of documents to train on.
            gamma_word (float): Smoothing value for the "word probability in a document"
                feature.
            gamma_init (float): Smoothing value for the initial probability.
            gamma_trans (float): Smoothing value for the transition probability.
            tf_table (Mapping[Word, float]): A precomputed term-frequency table that is already
                normalized.

        Returns:
            HMM: The trained model.
        """
        init_fdist = FreqDist()
        trans_fdist = ConditionalFreqDist()
        tagged_vecs: list = []
        states = set()

        for doc in docs:
            tags = cls._get_tags(doc.sentences)
            if not tags:
                continue

            init_fdist[tags[0]] += 1
            for prev, tag in zip(tags, tags[1:]):
                trans_fdist[prev][tag] += 1
            vecs = cls._get_feature_vectors(doc, gamma_word, tf=tf_table)
            tagged_vecs.extend(zip(vecs, tags))
            states.update(tags)

        # Initial probability
        init_pdist = LidstoneProbDist(init_fdist, gamma_init, bins=len(states))
        # Transition probability
        trans_pdist = ConditionalProbDist(
            trans_fdist, LidstoneProbDist, gamma_trans, bins=len(states))
        # Emission probability
        emit_pdist = _GaussianEmission.train(tagged_vecs)
        return cls(
            init_pdist, trans_pdist, emit_pdist, list(states), gamma=gamma_word,
            tf_table=tf_table) 
Example #29
Source File: util.py    From razzy-spinner with GNU General Public License v3.0 (4 votes)
def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None):
    """
    Train and test a classifier on instances of the Subjective Dataset by Pang and
    Lee. The dataset is made of 5000 subjective and 5000 objective sentences.
    All tokens (words and punctuation marks) are separated by whitespace, so
    we use the basic WhitespaceTokenizer to parse the data.

    :param trainer: `train` method of a classifier.
    :param save_analyzer: if `True`, store the SentimentAnalyzer in a pickle file.
    :param n_instances: the number of total sentences that have to be used for
        training and testing. Sentences will be equally split between positive
        and negative.
    :param output: the output file where results have to be reported.
    """
    from sentiment_analyzer import SentimentAnalyzer
    from nltk.corpus import subjectivity

    if n_instances is not None:
        n_instances = int(n_instances/2)

    subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
    obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]

    # We separately split subjective and objective instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_subj_docs, test_subj_docs = split_train_test(subj_docs)
    train_obj_docs, test_obj_docs = split_train_test(obj_docs)

    training_docs = train_subj_docs+train_obj_docs
    testing_docs = test_subj_docs+test_obj_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

    # Add simple unigram word features handling negation
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if save_analyzer:
        save_file(sentim_analyzer, 'sa_subjectivity.pickle')

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='subjectivity', Classifier=type(classifier).__name__,
                        Tokenizer='WhitespaceTokenizer', Feats=extr,
                        Instances=n_instances, Results=results)

    return sentim_analyzer 
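A usage sketch consistent with how Examples #2 and #3 call this function: train on the subjectivity corpus and cache the analyzer so that demo_sent_subjectivity() can reload it later.

from nltk.classify import NaiveBayesClassifier

sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, save_analyzer=True)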
Example #30
Source File: util.py    From razzy-spinner with GNU General Public License v3.0 (4 votes)
def demo_movie_reviews(trainer, n_instances=None, output=None):
    """
    Train a classifier on all instances of the Movie Reviews dataset.
    The corpus has been preprocessed using the default sentence tokenizer and
    WordPunctTokenizer.
    Features are composed of:
        - most frequent unigrams

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total reviews that have to be used for
        training and testing. Reviews will be equally split between positive and
        negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.corpus import movie_reviews
    from sentiment_analyzer import SentimentAnalyzer

    if n_instances is not None:
        n_instances = int(n_instances/2)

    pos_docs = [(list(movie_reviews.words(pos_id)), 'pos') for pos_id in movie_reviews.fileids('pos')[:n_instances]]
    neg_docs = [(list(movie_reviews.words(neg_id)), 'neg') for neg_id in movie_reviews.fileids('neg')[:n_instances]]
    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_docs = train_pos_docs+train_neg_docs
    testing_docs = test_pos_docs+test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words = sentim_analyzer.all_words(training_docs)

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='Movie_reviews', Classifier=type(classifier).__name__,
                        Tokenizer='WordPunctTokenizer', Feats=extr, Results=results,
                        Instances=n_instances)
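A hypothetical call to the function above, using NaiveBayesClassifier.train as the trainer; the instance count and output filename are illustrative only.

from nltk.classify import NaiveBayesClassifier

# 200 reviews total (100 positive, 100 negative), with results written as markdown.
demo_movie_reviews(NaiveBayesClassifier.train, n_instances=200, output='movie_reviews_results.md')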