Python Examples of sklearn.feature_extraction.DictVectorizer

Source File: learn.py From partisan-discourse with Apache License 2.0

6 votes

def transform(self, documents):
        """
        Lazily yield one dictionary of summary counts per document, ready
        to be handed to a DictVectorizer.

        Each document is walked as paragraphs -> sentences -> items, and only
        the first element of every item is counted.
        """
        for doc in documents:
            # Tally every occurrence of the first element of each item
            tally = Counter()
            for paragraph in doc:
                for sentence in paragraph:
                    tally.update(entry[0] for entry in sentence)

            # Total number of sentences across all paragraphs
            sentence_total = 0
            for paragraph in doc:
                sentence_total += len(paragraph)

            # Emit the structural statistics for this document
            yield {
                'paragraphs': len(doc),
                'sentences': sentence_total,
                'words': sum(tally.values()),
                'vocab': len(tally),
            }


##########################################################################
## Model Building Functions
##########################################################################

Source File: feature_extractors.py From StrepHit with GNU General Public License v3.0

6 votes

def __init__(self, language='en', window_width=2, collapse_fes=True, target_size=None):
        """ Set up the extractor state and then call ``self.start()``.

            :param language: Language of the sentences to process; also used
             to build the POS tagger and to load the stopword list.
            :param window_width: How many tokens to look at before and after
             each token when building its features.
            :param collapse_fes: Whether FEs are collapsed into a single token
             or kept split.
            :param target_size: When truthy, it is passed to TruncatedSVD to
             create the reducer; otherwise no reducer is created.
        """
        # Plain configuration values
        self.language = language
        self.window_width = window_width
        self.collapse_fes = collapse_fes
        self.target_size = target_size
        self.unk_feature = 'UNK'

        # Collaborating objects
        self.tagger = TTPosTagger(language)
        self.vectorizer = DictVectorizer()
        if target_size:
            self.reducer = TruncatedSVD(target_size)
        else:
            self.reducer = None

        # Empty containers, assigned fresh for every instance
        self.vocabulary = set()
        self.label_index = {}
        self.lu_index = {}

        # Lower-cased stopwords for the chosen language
        self.stopwords = {word.lower() for word in StopWords().words(language)}
        self.start()

Source File: test_sklearn_dict_vectorizer_converter.py From sklearn-onnx with MIT License

6 votes

def test_model_dict_vectorizer(self):
        """Convert a fitted string-keyed DictVectorizer and dump it with its data."""
        samples = [{"amy": 1.0, "chin": 200.0}, {"nice": 3.0, "amy": 1.0}]
        vectorizer = DictVectorizer()
        vectorizer.fit_transform(samples)

        # Single input: a dictionary mapping string keys to float values
        input_types = [(
            "input",
            DictionaryType(StringTensorType([1]), FloatTensorType([1])),
        )]
        converted = convert_sklearn(
            vectorizer, "dictionary vectorizer", input_types)
        self.assertTrue(converted is not None)

        # Version condition string handed to the dump helper as allow_failure
        failure_condition = (
            "StrictVersion(onnxruntime.__version__)"
            " <= StrictVersion('0.1.3') or "
            "StrictVersion(onnx.__version__)"
            " < StrictVersion('1.3.0')"
        )
        dump_data_and_model(
            samples, vectorizer, converted,
            basename="SklearnDictVectorizer-OneOff-SkipDim1",
            allow_failure=failure_condition)

Source File: test_sklearn_dict_vectorizer_converter.py From sklearn-onnx with MIT License

6 votes

def test_model_dict_vectorizer_sort_false(self):
        """Convert a dense, unsorted, integer-keyed DictVectorizer and dump it."""
        samples = [{1: 1.0, 2: 200.0}, {1: 3.0, 3: 1.0}]
        vectorizer = DictVectorizer(sparse=False, sort=False)
        vectorizer.fit_transform(samples)

        # Single input: a dictionary mapping int64 keys to float values
        input_types = [(
            "input",
            DictionaryType(Int64TensorType([1]), FloatTensorType([1])),
        )]
        converted = convert_sklearn(
            vectorizer, "dictionary vectorizer", input_types)
        self.assertTrue(converted is not None)

        # Version condition string handed to the dump helper as allow_failure
        failure_condition = (
            "StrictVersion(onnxruntime.__version__)"
            " <= StrictVersion('0.1.3') or "
            "StrictVersion(onnx.__version__)"
            " < StrictVersion('1.3.0')"
        )
        dump_data_and_model(
            samples,
            vectorizer,
            converted,
            basename="SklearnDictVectorizerSortFalse-OneOff-SkipDim1",
            allow_failure=failure_condition,
        )

Source File: predictor.py From auto_ml with MIT License

6 votes

def _consolidate_pipeline(self, transformation_pipeline, final_model=None):
        # First, restrict our DictVectorizer or DataFrameVectorizer
        # This goes through and has DV only output the items that have passed our support mask
        # This has a number of benefits: speeds up computation, reduces memory usage, and combines several transforms into a single, easy step
        # It also significantly reduces the size of dv.vocabulary_ which can get quite large

        try:
            feature_selection = transformation_pipeline.named_steps['feature_selection']
            feature_selection_mask = feature_selection.support_mask
            transformation_pipeline.named_steps['dv'].restrict(feature_selection_mask)
        except KeyError:
            pass

        # We have overloaded our _construct_pipeline method to work both to create a new pipeline from scratch at the start of training, and to go through a trained pipeline in exactly the same order and steps to take a dedicated FeatureSelection model out of an already trained pipeline
        # In this way, we ensure that we only have to maintain a single centralized piece of logic for the correct order a pipeline should follow
        trained_pipeline_without_feature_selection = self._construct_pipeline(trained_pipeline=transformation_pipeline, final_model=final_model)

        return trained_pipeline_without_feature_selection

Source File: test_feature_extraction.py From pandas-ml with BSD 3-Clause "New" or "Revised" License

6 votes

def test_objectmapper(self):
        """Check that the ModelFrame accessor exposes the very same objects as ``fe``."""
        df = pdml.ModelFrame([])
        mapped = df.feature_extraction

        # Top-level members
        self.assertIs(mapped.DictVectorizer, fe.DictVectorizer)
        self.assertIs(mapped.FeatureHasher, fe.FeatureHasher)

        # Members of the ``image`` namespace
        mapped_image = mapped.image
        self.assertIs(mapped_image.img_to_graph, fe.image.img_to_graph)
        self.assertIs(mapped_image.grid_to_graph, fe.image.grid_to_graph)
        self.assertIs(mapped_image.extract_patches_2d, fe.image.extract_patches_2d)
        self.assertIs(mapped_image.reconstruct_from_patches_2d,
                      fe.image.reconstruct_from_patches_2d)
        self.assertIs(mapped_image.PatchExtractor, fe.image.PatchExtractor)

        # Members of the ``text`` namespace
        mapped_text = mapped.text
        self.assertIs(mapped_text.CountVectorizer, fe.text.CountVectorizer)
        self.assertIs(mapped_text.HashingVectorizer, fe.text.HashingVectorizer)
        self.assertIs(mapped_text.TfidfTransformer, fe.text.TfidfTransformer)
        self.assertIs(mapped_text.TfidfVectorizer, fe.text.TfidfVectorizer)

Source File: scikitlearn.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International

6 votes

def __init__(self, estimator, dtype=float, sparse=True):
        """
        Wrap a scikit-learn estimator together with a label encoder and a
        dictionary vectorizer.

        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building the feature array.
            scikit-learn estimators operate only on numeric data, so the
            default should suit almost every situation.

        :param sparse: Whether sparse matrices are used internally.
            The estimator has to support them; not every scikit-learn
            classifier does (check its documentation for "sparse matrix").
            Defaults to True because most NLP problems have sparse feature
            sets. Passing False may consume a great amount of memory.
        :type sparse: boolean.
        """
        # Feature dictionaries are vectorized with the requested dtype/sparsity
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
        # Labels are encoded separately from the features
        self._encoder = LabelEncoder()
        self._clf = estimator

Source File: scikitlearn.py From razzy-spinner with GNU General Public License v3.0

6 votes

def __init__(self, estimator, dtype=float, sparse=True):
        """
        Store the estimator and create the label encoder and dictionary
        vectorizer that accompany it.

        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building the feature array.
            Since scikit-learn estimators work exclusively on numeric data,
            the default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            This requires support from the estimator, which not all
            scikit-learn classifiers provide (look for "sparse matrix" in
            their documentation). True by default, as most NLP problems
            involve sparse feature sets; False may take a great amount of
            memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        # Vectorizer honours the caller's dtype and sparsity choice
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
        self._encoder = LabelEncoder()

Source File: pico_robot.py From robotreviewer with GNU General Public License v3.0

6 votes

def __init__(self):
        """Create the vectorizers with a fixed feature layout, plus the Drugbank helper."""
        self.vectorizer = HashingVectorizer(ngram_range=(1, 2))
        self.dict_vectorizer = DictVectorizer()

        # Training sets these dynamically; they are pinned here so they match
        # the feature names of the trained model. Retraining the model may
        # require changing them.
        quintile_names = ['DocumentPositionQuintile%d' % idx for idx in range(7)]
        self.dict_vectorizer.feature_names_ = quintile_names
        # Map every feature name to its position in the list above
        self.dict_vectorizer.vocabulary_ = {
            name: position
            for position, name in enumerate(self.dict_vectorizer.feature_names_)
        }

        self.drugbank = Drugbank()

Source File: test_dict_vectorizer.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_dictvectorizer(sparse, dtype, sort, iterable):
    """Fit/transform a small dict dataset and check shape, sum and round trip.

    :param sparse: passed to ``DictVectorizer``; also the expected sparsity
        of the output.
    :param dtype: passed to ``DictVectorizer``.
    :param sort: passed to ``DictVectorizer``; when true the feature names
        must come out sorted.
    :param iterable: when true the data is fed as a one-shot iterator
        instead of a list.
    """
    D = [{"foo": 1, "bar": 3},
         {"bar": 4, "baz": 2},
         {"bar": 1, "quux": 1, "quuux": 2}]

    v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
    X = v.fit_transform(iter(D) if iterable else D)

    assert_equal(sp.issparse(X), sparse)
    assert_equal(X.shape, (3, 5))
    assert_equal(X.sum(), 14)
    assert_equal(v.inverse_transform(X), D)

    # A fresh iterator is built because fit_transform consumed the first one.
    X_again = v.transform(iter(D) if iterable else D)
    if sparse:
        # CSR matrices can't be compared for equality, so densify both.
        # ``toarray()`` replaces the ``.A`` shorthand, which newer SciPy
        # releases deprecate.
        assert_array_equal(X.toarray(), X_again.toarray())
    else:
        assert_array_equal(X, X_again)

    if sort:
        assert_equal(v.feature_names_,
                     sorted(v.feature_names_))

Source File: test_dict_vectorizer.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_unseen_or_no_features():
    """Unseen-only and empty feature dicts must both transform to an all-zero row."""
    training = [{"camelot": 0, "spamalot": 1}]
    for sparse in (True, False):
        v = DictVectorizer(sparse=sparse).fit(training)

        # First a dict holding only an unseen feature, then an empty dict
        for sample in ({"push the pram a lot": 2}, {}):
            X = v.transform(sample)
            if sparse:
                X = X.toarray()
            assert_array_equal(X, np.zeros((1, 2)))

        # The message is only checked if a ValueError is actually raised
        try:
            v.transform([])
        except ValueError as err:
            assert_in("empty", str(err))

Source File: test_dict_vectorizer.py From twitter-stock-recommendation with MIT License

6 votes

def test_unseen_or_no_features():
    """Check the all-zero output for unseen-only and for empty feature dicts."""
    fit_data = [{"camelot": 0, "spamalot": 1}]
    expected = (1, 2)
    for sparse in (True, False):
        v = DictVectorizer(sparse=sparse).fit(fit_data)

        # Same assertion for an unseen-feature dict and for an empty dict
        for features in ({"push the pram a lot": 2}, {}):
            X = v.transform(features)
            if sparse:
                X = X.toarray()
            assert_array_equal(X, np.zeros(expected))

        # An empty sample list may raise; if it does, inspect the message
        try:
            v.transform([])
        except ValueError as exc:
            assert_in("empty", str(exc))

Source File: test_dict_vectorizer.py From sparkit-learn with Apache License 2.0

5 votes

def test_same_output_dense(self):
        """Dense output of the Spark vectorizer must match the local one."""
        X, X_rdd = self.make_dict_dataset()

        local = DictVectorizer(sparse=False)
        dist = SparkDictVectorizer(sparse=False)
        expected = local.fit_transform(X)
        actual = dist.fit_transform(X_rdd)

        # The distributed result has to be backed by numpy arrays
        assert_true(check_rdd_dtype(actual, (np.ndarray,)))
        # Both vectorizers must learn the same vocabulary and values
        assert_equal(local.vocabulary_, dist.vocabulary_)
        assert_array_equal(expected, actual.toarray())

Source File: test_cml_DictVectorizerConverter.py From onnxmltools with MIT License

5 votes

def test_dict_vectorizer(self):
        """Convert a fitted DictVectorizer via CoreML to ONNX and dump it."""
        samples = [{'amy': 1., 'chin': 200.}, {'nice': 3., 'amy': 1.}]
        vectorizer = DictVectorizer()
        vectorizer.fit_transform(samples)

        # sklearn -> CoreML -> ONNX
        coreml_model = coremltools.converters.sklearn.convert(vectorizer)
        onnx_model = convert(coreml_model.get_spec())
        self.assertTrue(onnx_model is not None)

        dump_data_and_model(
            samples, vectorizer, onnx_model,
            basename="CmlDictVectorizer-OneOff-SkipDim1",
            allow_failure="StrictVersion(onnx.__version__) < StrictVersion('1.3.0')")

Source File: test_sklearn_dict_vectorizer_converter.py From sklearn-onnx with MIT License

5 votes

def test_model_dict_vectorizer_issue(self):
        """Converting an int-keyed, string-valued vectorizer must raise RuntimeError."""
        samples = [
            {1: 'A', 2: 'B'},
            {1: 'C', 3: 'D'},
            {1: 'C', 3: 'A'},
        ]
        vectorizer = DictVectorizer(sparse=False).fit(samples)
        with self.assertRaises(RuntimeError):
            # Input declared as a dictionary from int64 keys to string values
            input_types = [(
                "input",
                DictionaryType(Int64TensorType([1]), StringTensorType([1])),
            )]
            convert_sklearn(vectorizer, 'dv', input_types)

Source File: test_dict_vectorizer.py From twitter-stock-recommendation with MIT License

5 votes

def test_deterministic_vocabulary():
    """The fitted vocabulary must not depend on dict insertion order."""
    # Two equal dictionaries, built from the same pairs in different orders
    pairs = [("%03d" % number, number) for number in range(1000)]
    ordered = dict(pairs)
    Random(42).shuffle(pairs)
    shuffled = dict(pairs)

    # Fit one vectorizer per layout and compare what each one learned
    vocab_ordered = DictVectorizer().fit([ordered]).vocabulary_
    vocab_shuffled = DictVectorizer().fit([shuffled]).vocabulary_

    assert_equal(vocab_ordered, vocab_shuffled)

Source File: text_models.py From mindmeld with Apache License 2.0

5 votes

def __init__(self, config):
        """Initialize the parent with ``config`` and create this model's helpers.

        :param config: configuration object, forwarded unchanged to the
            parent initializer.
        """
        super().__init__(config)
        # Encoder for class labels and vectorizer for feature dictionaries
        self._class_encoder = SKLabelEncoder()
        self._feat_vectorizer = DictVectorizer()
        # Selector and scaler are whatever the helper methods return
        self._feat_selector = self._get_feature_selector()
        self._feat_scaler = self._get_feature_scaler()
        # No meta type yet; the meta features use a dense vectorizer
        self._meta_type = None
        self._meta_feat_vectorizer = DictVectorizer(sparse=False)
        # Starts empty / unset
        self._base_clfs = {}
        self.cv_loss_ = None
        self.train_acc_ = None

Source File: memm.py From mindmeld with Apache License 2.0

5 votes

def setup_model(self, config):
        """Create the encoder, vectorizer, feature selector and feature scaler.

        The selector and scaler types are read from ``config.model_settings``
        under the keys "feature_selector" and "feature_scaler"; both are None
        when ``model_settings`` is None.
        """
        settings = config.model_settings
        selector_type = None if settings is None else settings.get("feature_selector")
        scale_type = None if settings is None else settings.get("feature_scaler")

        self.class_encoder = SKLabelEncoder()
        self.feat_vectorizer = DictVectorizer()
        self._feat_selector = self._get_feature_selector(selector_type)
        self._feat_scaler = self._get_feature_scaler(scale_type)

Source File: test_dict_vectorizer.py From sparkit-learn with Apache License 2.0

5 votes

def test_same_output_sparse(self):
        """Sparse output of the Spark vectorizer must match the local one."""
        X, X_rdd = self.make_dict_dataset()

        local = DictVectorizer(sparse=True)
        dist = SparkDictVectorizer(sparse=True)
        expected = local.fit_transform(X)
        actual = dist.fit_transform(X_rdd)

        # The distributed result has to be backed by scipy sparse matrices
        assert_true(check_rdd_dtype(actual, (sp.spmatrix,)))
        # Both vectorizers must learn the same vocabulary and values
        assert_equal(local.vocabulary_, dist.vocabulary_)
        assert_array_equal(expected.toarray(), actual.toarray())

Source File: transform_features.py From fake-news-detection with MIT License

5 votes

def get_feature_transformer(parser, run_grammar=True, run_tfidf=True):
    '''
    Build a FeatureUnion over a TFIDF pipeline and/or a grammar-count pipeline.
    Suitable for use as a step in a SKLearn Pipeline.

    inputs:
        parser: a Spacy pipeline object, passed to the tokenizer and grammar steps
        run_grammar: include the grammar-count pipeline
        run_tfidf: include the TFIDF pipeline
    returns:
        feature transformer: FeatureUnion

    NOTE(review): with both flags false the result is still the TFIDF-only
    union, because the "not run_grammar" branch is tested first.
    '''
    # Both pipelines are always constructed, whichever ones end up being used
    tfidf_steps = [
        ('cln', CleanTextTransformer()),
        ('pre', PreTokenizer(parser=parser)),
        ('vect', TfidfVectorizer(
                     max_features=3000, decode_error='replace')),
        ('clf', None),
    ]
    tfidf = Pipeline(tfidf_steps)

    grammar_steps = [
        ('cln', CleanTextTransformer()),
        ('grm', GrammarTransformer(parser=parser)),
        ('to_dict', DictVectorizer()),
        ('clf', None),
    ]
    grammar_counter = Pipeline(grammar_steps)

    if run_grammar and run_tfidf:
        print('Running both feature sets.')
        members = [("tfidf", tfidf), ('grammar_counter', grammar_counter)]
    elif not run_grammar:
        print('Running only TFIDF.')
        members = [("tfidf", tfidf)]
    else:
        # Only reachable with run_grammar true and run_tfidf false
        print('Running only PCFGs.')
        members = [('grammar_counter', grammar_counter)]
    return FeatureUnion(members)

Source File: lexrankr.py From lexrankr with MIT License

5 votes

def __init__(self, similarity='cosine', decay_window=20, decay_alpha=0.25, clustering='dbscan', tagger='twitter', useful_tags=['Noun', 'Verb', 'Adjective', 'Determiner', 'Adverb', 'Conjunction', 'Josa', 'PreEomi', 'Eomi', 'Suffix', 'Alpha', 'Number'], delimiters=['. ', '\n', '.\n'], min_token_length=2, stopwords=stopwords_ko, no_below_word_count=2, no_above_word_portion=0.85, max_dictionary_size=None, min_cluster_size=2, similarity_threshold=0.85, matrix_smoothing=False, n_clusters=None, compactify=True, **kwargs):
        """Configure the similarity function, sentence factory and clusterer.

        :param similarity: 'cosine', 'jaccard' or 'normalized_cooccurrence';
            selects ``self.uniform_sim``. Only 'cosine' creates
            ``self.vectorizer``.
        :param decay_window: stored as ``self.decay_window``.
        :param decay_alpha: stored as ``self.decay_alpha``.
        :param clustering: 'birch', 'dbscan', 'affinity' or None. None puts
            every row into cluster 0.
        :param tagger, useful_tags, delimiters, min_token_length, stopwords,
            kwargs: forwarded to ``SentenceFactory``.
        :param n_clusters: passed to ``Birch`` when clustering is 'birch'.
        :param no_below_word_count, no_above_word_portion,
            max_dictionary_size, similarity_threshold, min_cluster_size,
            matrix_smoothing, compactify: stored on the instance unchanged.
        :raises LexRankError: for an unknown similarity or clustering name.

        NOTE(review): ``useful_tags`` and ``delimiters`` have list defaults,
        shared by every call that omits them. That is only safe if
        ``SentenceFactory`` never mutates them -- confirm.
        """
        self.decay_window = decay_window
        self.decay_alpha = decay_alpha
        if similarity == 'cosine':  # very, very slow :(
            self.vectorizer = DictVectorizer()
            self.uniform_sim = self._sim_cosine
        elif similarity == 'jaccard':
            self.uniform_sim = self._sim_jaccard
        elif similarity == 'normalized_cooccurrence':
            self.uniform_sim = self._sim_normalized_cooccurrence
        else:
            raise LexRankError("available similarity functions are: cosine, jaccard, normalized_cooccurrence")
        # Final similarity = decay factor * the selected uniform similarity
        self.sim = lambda sentence1, sentence2: self.decay(sentence1, sentence2) * self.uniform_sim(sentence1, sentence2)
        self.factory = SentenceFactory(tagger=tagger, useful_tags=useful_tags, delimiters=delimiters, min_token_length=min_token_length, stopwords=stopwords, **kwargs)
        # Every estimator-backed clusterer is fed (1 - matrix)
        if clustering == 'birch':
            self._birch = Birch(threshold=0.99, n_clusters=n_clusters)
            self._clusterer = lambda matrix: self._birch.fit_predict(1 - matrix)
        elif clustering == 'dbscan':
            self._dbscan = DBSCAN()
            self._clusterer = lambda matrix: self._dbscan.fit_predict(1 - matrix)
        elif clustering == 'affinity':
            self._affinity = AffinityPropagation()
            self._clusterer = lambda matrix: self._affinity.fit_predict(1 - matrix)
        elif clustering is None:
            # No clustering: label every row with cluster 0
            self._clusterer = lambda matrix: [0 for index in range(matrix.shape[0])]
        else:
            # The message lists exactly the options handled above
            raise LexRankError("available clustering algorithms are: birch, dbscan, affinity, no-clustering(use `None`)")
        self.no_below_word_count = no_below_word_count
        self.no_above_word_portion = no_above_word_portion
        self.max_dictionary_size = max_dictionary_size
        self.similarity_threshold = similarity_threshold
        self.min_cluster_size = min_cluster_size
        self.matrix_smoothing = matrix_smoothing
        self.compactify = compactify

Source File: test_dict_vectorizer.py From twitter-stock-recommendation with MIT License

5 votes

def test_dictvectorizer():
    """Check DictVectorizer across every sparse/dtype/sort/iterable combination.

    Each combination is checked for output sparsity, shape, sum, the
    inverse-transform round trip, agreement between ``fit_transform`` and
    ``transform``, and (when ``sort`` is true) sorted feature names.
    """
    D = [{"foo": 1, "bar": 3},
         {"bar": 4, "baz": 2},
         {"bar": 1, "quux": 1, "quuux": 2}]

    for sparse in (True, False):
        for dtype in (int, np.float32, np.int16):
            for sort in (True, False):
                for iterable in (True, False):
                    v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
                    X = v.fit_transform(iter(D) if iterable else D)

                    assert_equal(sp.issparse(X), sparse)
                    assert_equal(X.shape, (3, 5))
                    assert_equal(X.sum(), 14)
                    assert_equal(v.inverse_transform(X), D)

                    # A fresh iterator is built because fit_transform
                    # consumed the first one.
                    X_again = v.transform(iter(D) if iterable else D)
                    if sparse:
                        # CSR matrices can't be compared for equality, so
                        # densify both. ``toarray()`` replaces the ``.A``
                        # shorthand, which newer SciPy releases deprecate.
                        assert_array_equal(X.toarray(), X_again.toarray())
                    else:
                        assert_array_equal(X, X_again)

                    if sort:
                        assert_equal(v.feature_names_,
                                     sorted(v.feature_names_))

Source File: test_dict_vectorizer.py From twitter-stock-recommendation with MIT License

5 votes

def test_feature_selection():
    """Restricting to the chi2 top-2 features must leave only the useful ones."""
    # Two feature dicts sharing twenty useless (in terms of chi2) features
    # and differing only in the two useful ones
    useless = [("useless%d" % i, 10) for i in range(20)]
    d1 = dict(useless, useful1=1, useful2=20)
    d2 = dict(useless, useful1=20, useful2=1)
    samples = [d1, d2]

    # Exercise restrict() with index arrays and with boolean masks
    for indices in (True, False):
        v = DictVectorizer().fit(samples)
        X = v.transform(samples)
        sel = SelectKBest(chi2, k=2).fit(X, [0, 1])

        support = sel.get_support(indices=indices)
        v.restrict(support, indices=indices)
        assert_equal(v.get_feature_names(), ["useful1", "useful2"])

Source File: test_dict_vectorizer.py From twitter-stock-recommendation with MIT License

5 votes

def test_one_of_k():
    """String values must be expanded into "name=value" features."""
    samples = [
        {"version": "1", "ham": 2},
        {"version": "2", "spam": .3},
        {"version=3": True, "spam": -1},
    ]
    v = DictVectorizer()
    X = v.fit_transform(samples)
    assert_equal(X.shape, (3, 5))

    # The first sample comes back with its string value folded into the key
    recovered = v.inverse_transform(X)
    assert_equal(recovered[0], {"version=1": 1, "ham": 2})

    # Only the expanded name is a feature, never the bare key
    feature_names = v.get_feature_names()
    assert_true("version=2" in feature_names)
    assert_false("version" in feature_names)

Source File: custom_transformers.py From pandas-pipelines-custom-transformers with MIT License

5 votes

def fit(self, X, y=None):
        """Fit a dense DictVectorizer on the rows of ``X`` and return ``self``.

        ``y`` is accepted but not used.
        """
        # assumes all columns of X are strings
        records = X.to_dict(orient='records')
        self.dv = DictVectorizer(sparse=False)
        self.dv.fit(records)
        return self

Source File: test_dict_vectorizer.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_feature_selection():
    """After restrict(), only the two chi2-selected features may remain."""
    # Build two feature dicts: twenty useless features (in terms of chi2)
    # in common, plus two useful features with swapped values
    filler = [("useless%d" % i, 10) for i in range(20)]
    d1 = dict(filler, useful1=1, useful2=20)
    d2 = dict(filler, useful1=20, useful2=1)
    data = [d1, d2]

    # Run once with integer indices and once with a boolean mask
    for indices in (True, False):
        v = DictVectorizer().fit(data)
        X = v.transform(data)
        sel = SelectKBest(chi2, k=2).fit(X, [0, 1])

        selected = sel.get_support(indices=indices)
        v.restrict(selected, indices=indices)
        assert_equal(v.get_feature_names(), ["useful1", "useful2"])

Source File: test_dict_vectorizer.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_one_of_k():
    """String-valued entries must become "name=value" features."""
    records = [
        {"version": "1", "ham": 2},
        {"version": "2", "spam": .3},
        {"version=3": True, "spam": -1},
    ]
    v = DictVectorizer()
    X = v.fit_transform(records)
    assert_equal(X.shape, (3, 5))

    # The round trip returns the expanded key for the first record
    round_trip = v.inverse_transform(X)
    assert_equal(round_trip[0], {"version=1": 1, "ham": 2})

    # The expanded name is present, the bare key is not
    feature_names = v.get_feature_names()
    assert "version=2" in feature_names
    assert "version" not in feature_names

Source File: test_dict_vectorizer.py From Mastering-Elasticsearch-7.0 with MIT License

5 votes

def test_deterministic_vocabulary():
    """Equal dicts with different insertion orders must give one vocabulary."""
    # Create equal dictionaries whose memory layouts differ
    entries = [("%03d" % n, n) for n in range(1000)]
    in_order = dict(entries)
    shuffler = Random(42)
    shuffler.shuffle(entries)
    out_of_order = dict(entries)

    # The memory layout must have no effect on the resulting vocabulary
    first = DictVectorizer().fit([in_order])
    second = DictVectorizer().fit([out_of_order])
    assert_equal(first.vocabulary_, second.vocabulary_)

Source File: function_clustering.py From Firmware_Slap with GNU General Public License v3.0

5 votes

def funcs_to_sparse(func_list):
    """Fit a DictVectorizer on ``func_list``.

    Returns a ``(vectorizer, matrix)`` pair: the fitted vectorizer and the
    result of its ``fit_transform`` call.
    """
    vectorizer = DictVectorizer()
    matrix = vectorizer.fit_transform(func_list)
    return vectorizer, matrix

Source File: pipeline.py From whereami with GNU Affero General Public License v3.0

5 votes

def get_pipeline(clf=None):
    """Build a pipeline: dense ``DictVectorizer`` followed by a classifier.

    :param clf: final estimator of the pipeline. When omitted (None), a new
        ``RandomForestClassifier(n_estimators=100, class_weight="balanced")``
        is created for this call.
    :returns: the pipeline produced by ``make_pipeline``.
    """
    # Build the default per call: a default argument is evaluated only once,
    # so every default pipeline would otherwise share (and re-fit) the same
    # classifier instance.
    if clf is None:
        clf = RandomForestClassifier(n_estimators=100, class_weight="balanced")
    return make_pipeline(DictVectorizer(sparse=False), clf)

Python sklearn.feature_extraction.DictVectorizer() Examples