Python sklearn.feature_extraction.DictVectorizer() Examples
The following are 30 code examples of sklearn.feature_extraction.DictVectorizer().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module sklearn.feature_extraction, or try the search function.
Example #1
Source File: learn.py From partisan-discourse with Apache License 2.0 | 6 votes |
def transform(self, documents):
    """
    Yield one dictionary of summary counts per document, for use as input
    to a DictVectorizer.

    Each document is walked as paragraphs -> sentences -> items, and the
    first element of every item is what gets counted as a token.
    """
    for doc in documents:
        # Tally tokens while walking every sentence of every paragraph
        token_counts = Counter()
        for paragraph in doc:
            for sentence in paragraph:
                token_counts.update(entry[0] for entry in sentence)

        sentence_total = sum(len(paragraph) for paragraph in doc)

        # Emit the structural features of this document
        yield {
            'paragraphs': len(doc),
            'sentences': sentence_total,
            'words': sum(token_counts.values()),
            'vocab': len(token_counts),
        }


##########################################################################
## Model Building Functions
##########################################################################
Example #2
Source File: feature_extractors.py From StrepHit with GNU General Public License v3.0 | 6 votes |
def __init__(self, language='en', window_width=2, collapse_fes=True, target_size=None):
    """ Set up the state of the extractor.

        :param language: The language of the sentences that will be processed
        :param window_width: how many tokens to look at before and after each
         token when building its features.
        :param collapse_fes: Whether FEs are collapsed into a single token or
         kept split.
        :param target_size: when truthy, handed to TruncatedSVD to build the
         reducer; otherwise no reducer is created.
    """
    self.language = language
    self.tagger = TTPosTagger(language)
    self.window_width = window_width
    self.collapse_fes = collapse_fes
    self.unk_feature = 'UNK'
    self.vectorizer = DictVectorizer()
    self.target_size = target_size

    # Dimensionality reduction is optional
    if target_size:
        self.reducer = TruncatedSVD(target_size)
    else:
        self.reducer = None

    self.vocabulary = set()
    self.label_index = {}
    self.lu_index = {}
    self.stopwords = {word.lower() for word in StopWords().words(language)}
    self.start()
Example #3
Source File: test_sklearn_dict_vectorizer_converter.py From sklearn-onnx with MIT License | 6 votes |
def test_model_dict_vectorizer(self):
    """Convert a fitted, string-keyed DictVectorizer and dump data and models."""
    samples = [{"amy": 1.0, "chin": 200.0}, {"nice": 3.0, "amy": 1.0}]
    model = DictVectorizer()
    model.fit_transform(samples)

    # Declared input: a dictionary from string keys to float values
    input_type = DictionaryType(StringTensorType([1]), FloatTensorType([1]))
    model_onnx = convert_sklearn(
        model, "dictionary vectorizer", [("input", input_type)])
    self.assertIsNotNone(model_onnx)

    dump_data_and_model(
        samples,
        model,
        model_onnx,
        basename="SklearnDictVectorizer-OneOff-SkipDim1",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.1.3') or "
                      "StrictVersion(onnx.__version__)"
                      " < StrictVersion('1.3.0')")
Example #4
Source File: test_sklearn_dict_vectorizer_converter.py From sklearn-onnx with MIT License | 6 votes |
def test_model_dict_vectorizer_sort_false(self):
    """Convert a dense, unsorted, integer-keyed DictVectorizer and dump data and models."""
    samples = [{1: 1.0, 2: 200.0}, {1: 3.0, 3: 1.0}]
    model = DictVectorizer(sparse=False, sort=False)
    model.fit_transform(samples)

    # Declared input: a dictionary from int64 keys to float values
    input_type = DictionaryType(Int64TensorType([1]), FloatTensorType([1]))
    model_onnx = convert_sklearn(
        model, "dictionary vectorizer", [("input", input_type)],
    )
    self.assertIsNotNone(model_onnx)

    dump_data_and_model(
        samples,
        model,
        model_onnx,
        basename="SklearnDictVectorizerSortFalse-OneOff-SkipDim1",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.1.3') or "
                      "StrictVersion(onnx.__version__)"
                      " < StrictVersion('1.3.0')",
    )
Example #5
Source File: predictor.py From auto_ml with MIT License | 6 votes |
def _consolidate_pipeline(self, transformation_pipeline, final_model=None): # First, restrict our DictVectorizer or DataFrameVectorizer # This goes through and has DV only output the items that have passed our support mask # This has a number of benefits: speeds up computation, reduces memory usage, and combines several transforms into a single, easy step # It also significantly reduces the size of dv.vocabulary_ which can get quite large try: feature_selection = transformation_pipeline.named_steps['feature_selection'] feature_selection_mask = feature_selection.support_mask transformation_pipeline.named_steps['dv'].restrict(feature_selection_mask) except KeyError: pass # We have overloaded our _construct_pipeline method to work both to create a new pipeline from scratch at the start of training, and to go through a trained pipeline in exactly the same order and steps to take a dedicated FeatureSelection model out of an already trained pipeline # In this way, we ensure that we only have to maintain a single centralized piece of logic for the correct order a pipeline should follow trained_pipeline_without_feature_selection = self._construct_pipeline(trained_pipeline=transformation_pipeline, final_model=final_model) return trained_pipeline_without_feature_selection
Example #6
Source File: test_feature_extraction.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_objectmapper(self):
    """Every name reachable through the ModelFrame accessor must be the very object found on ``fe``."""
    df = pdml.ModelFrame([])
    accessor = df.feature_extraction

    # Top-level vectorizers
    self.assertIs(accessor.DictVectorizer, fe.DictVectorizer)
    self.assertIs(accessor.FeatureHasher, fe.FeatureHasher)

    # Image helpers
    image_ns = accessor.image
    self.assertIs(image_ns.img_to_graph, fe.image.img_to_graph)
    self.assertIs(image_ns.grid_to_graph, fe.image.grid_to_graph)
    self.assertIs(image_ns.extract_patches_2d, fe.image.extract_patches_2d)
    self.assertIs(image_ns.reconstruct_from_patches_2d,
                  fe.image.reconstruct_from_patches_2d)
    self.assertIs(image_ns.PatchExtractor, fe.image.PatchExtractor)

    # Text helpers
    text_ns = accessor.text
    self.assertIs(text_ns.CountVectorizer, fe.text.CountVectorizer)
    self.assertIs(text_ns.HashingVectorizer, fe.text.HashingVectorizer)
    self.assertIs(text_ns.TfidfTransformer, fe.text.TfidfTransformer)
    self.assertIs(text_ns.TfidfVectorizer, fe.text.TfidfVectorizer)
Example #7
Source File: scikitlearn.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def __init__(self, estimator, dtype=float, sparse=True):
    """
    :param estimator: scikit-learn classifier object.

    :param dtype: data type of the feature array that gets built.
        scikit-learn estimators only work on numeric data, and the
        default is expected to suit almost every situation.

    :param sparse: Whether sparse matrices are used internally.
        The estimator has to support them, which not every scikit-learn
        classifier does (check the documentation of the classifier for
        "sparse matrix"). True by default, because most NLP problems have
        sparse feature sets; False may need a large amount of memory.
    :type sparse: boolean.
    """
    # Features are vectorized with the requested dtype and sparsity;
    # labels get their own encoder.
    self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
    self._encoder = LabelEncoder()
    self._clf = estimator
Example #8
Source File: scikitlearn.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def __init__(self, estimator, dtype=float, sparse=True):
    """
    :param estimator: scikit-learn classifier object.

    :param dtype: data type of the feature array that gets built.
        scikit-learn estimators only work on numeric data, and the
        default is expected to suit almost every situation.

    :param sparse: Whether sparse matrices are used internally.
        The estimator has to support them, which not every scikit-learn
        classifier does (check the documentation of the classifier for
        "sparse matrix"). True by default, because most NLP problems have
        sparse feature sets; False may need a large amount of memory.
    :type sparse: boolean.
    """
    # Features are vectorized with the requested dtype and sparsity;
    # labels get their own encoder.
    self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
    self._encoder = LabelEncoder()
    self._clf = estimator
Example #9
Source File: pico_robot.py From robotreviewer with GNU General Public License v3.0 | 6 votes |
def __init__(self):
    """Create the text and dict vectorizers with a fixed feature layout, plus the Drugbank helper."""
    self.vectorizer = HashingVectorizer(ngram_range=(1, 2))
    self.dict_vectorizer = DictVectorizer()

    # Training would normally set these attributes dynamically. They are
    # pinned here so that they line up with the trailing feature names of
    # the trained model; retraining the model may require updating them.
    quintile_names = [
        'DocumentPositionQuintile0',
        'DocumentPositionQuintile1',
        'DocumentPositionQuintile2',
        'DocumentPositionQuintile3',
        'DocumentPositionQuintile4',
        'DocumentPositionQuintile5',
        'DocumentPositionQuintile6',
    ]
    self.dict_vectorizer.feature_names_ = quintile_names
    # Map each feature name to its position in feature_names_
    self.dict_vectorizer.vocabulary_ = {
        name: position
        for position, name in enumerate(self.dict_vectorizer.feature_names_)
    }

    self.drugbank = Drugbank()
Example #10
Source File: test_dict_vectorizer.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_dictvectorizer(sparse, dtype, sort, iterable):
    """Round-trip a small dataset through DictVectorizer.

    Checks the sparsity, shape and sum of the fitted output, that
    ``inverse_transform`` restores the input, that ``transform`` reproduces
    ``fit_transform``, and (when ``sort`` is set) that feature names are
    sorted.

    :param sparse: passed to DictVectorizer; also the expected sparsity.
    :param dtype: passed to DictVectorizer.
    :param sort: passed to DictVectorizer.
    :param iterable: feed the data as a one-shot iterator instead of a list.
    """
    D = [{"foo": 1, "bar": 3},
         {"bar": 4, "baz": 2},
         {"bar": 1, "quux": 1, "quuux": 2}]

    def _data():
        # A fresh iterator per call: an iterator is exhausted after one pass.
        return iter(D) if iterable else D

    v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
    X = v.fit_transform(_data())

    assert_equal(sp.issparse(X), sparse)
    assert_equal(X.shape, (3, 5))
    assert_equal(X.sum(), 14)
    assert_equal(v.inverse_transform(X), D)

    if sparse:
        # CSR matrices can't be compared for equality, so densify first.
        # ``.toarray()`` is used rather than the deprecated ``.A`` shorthand,
        # which sparse arrays do not provide.
        assert_array_equal(X.toarray(), v.transform(_data()).toarray())
    else:
        assert_array_equal(X, v.transform(_data()))

    if sort:
        assert_equal(v.feature_names_, sorted(v.feature_names_))
Example #11
Source File: test_dict_vectorizer.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_unseen_or_no_features():
    """Unseen-only and empty samples must both vectorize to a row of zeros."""
    D = [{"camelot": 0, "spamalot": 1}]

    for sparse in [True, False]:
        v = DictVectorizer(sparse=sparse).fit(D)

        # One sample holding only an unseen feature, then one with no features
        for sample in ({"push the pram a lot": 2}, {}):
            X = v.transform(sample)
            if sparse:
                X = X.toarray()
            assert_array_equal(X, np.zeros((1, 2)))

        # An empty list of samples: the message is only inspected when a
        # ValueError is actually raised.
        try:
            v.transform([])
        except ValueError as e:
            assert_in("empty", str(e))
Example #12
Source File: test_dict_vectorizer.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_unseen_or_no_features():
    """Unseen-only and empty samples must both vectorize to a row of zeros."""
    D = [{"camelot": 0, "spamalot": 1}]

    for sparse in [True, False]:
        v = DictVectorizer(sparse=sparse).fit(D)

        # One sample holding only an unseen feature, then one with no features
        for sample in ({"push the pram a lot": 2}, {}):
            X = v.transform(sample)
            if sparse:
                X = X.toarray()
            assert_array_equal(X, np.zeros((1, 2)))

        # An empty list of samples: the message is only inspected when a
        # ValueError is actually raised.
        try:
            v.transform([])
        except ValueError as e:
            assert_in("empty", str(e))
Example #13
Source File: test_dict_vectorizer.py From sparkit-learn with Apache License 2.0 | 5 votes |
def test_same_output_dense(self):
    """Dense output: the Spark vectorizer must agree with the local DictVectorizer."""
    X, X_rdd = self.make_dict_dataset()

    reference = DictVectorizer(sparse=False)
    distributed = SparkDictVectorizer(sparse=False)

    expected = reference.fit_transform(X)
    actual = distributed.fit_transform(X_rdd)

    # The distributed result must hold numpy arrays
    assert_true(check_rdd_dtype(actual, (np.ndarray,)))
    assert_equal(reference.vocabulary_, distributed.vocabulary_)
    assert_array_equal(expected, actual.toarray())
Example #14
Source File: test_cml_DictVectorizerConverter.py From onnxmltools with MIT License | 5 votes |
def test_dict_vectorizer(self):
    """Fit a DictVectorizer, convert it through coremltools, then convert the resulting spec."""
    samples = [{'amy': 1., 'chin': 200.}, {'nice': 3., 'amy': 1.}]
    model = DictVectorizer()
    model.fit_transform(samples)

    # scikit-learn model -> Core ML model -> spec -> converted model
    coreml_spec = coremltools.converters.sklearn.convert(model).get_spec()
    model_onnx = convert(coreml_spec)
    self.assertIsNotNone(model_onnx)

    dump_data_and_model(
        samples, model, model_onnx,
        basename="CmlDictVectorizer-OneOff-SkipDim1",
        allow_failure="StrictVersion(onnx.__version__) < StrictVersion('1.3.0')")
Example #15
Source File: test_sklearn_dict_vectorizer_converter.py From sklearn-onnx with MIT License | 5 votes |
def test_model_dict_vectorizer_issue(self):
    """Converting a DictVectorizer fitted on string values is expected to raise RuntimeError."""
    key_value_map = [
        {1: 'A', 2: 'B'},
        {1: 'C', 3: 'D'},
        {1: 'C', 3: 'A'},
    ]
    model = DictVectorizer(sparse=False).fit(key_value_map)

    with self.assertRaises(RuntimeError):
        # Declared input: a dictionary from int64 keys to string values
        input_type = DictionaryType(Int64TensorType([1]), StringTensorType([1]))
        convert_sklearn(model, 'dv', [("input", input_type)])
Example #16
Source File: test_dict_vectorizer.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_deterministic_vocabulary():
    """The fitted vocabulary must not depend on dictionary insertion order."""
    # Two equal dictionaries built from differently ordered item lists
    ordered_items = [("%03d" % i, i) for i in range(1000)]
    shuffled_items = list(ordered_items)
    Random(42).shuffle(shuffled_items)

    d_sorted = dict(ordered_items)
    d_shuffled = dict(shuffled_items)

    # The layout of the input must not affect the resulting vocabulary
    vocab_sorted = DictVectorizer().fit([d_sorted]).vocabulary_
    vocab_shuffled = DictVectorizer().fit([d_shuffled]).vocabulary_
    assert_equal(vocab_sorted, vocab_shuffled)
Example #17
Source File: text_models.py From mindmeld with Apache License 2.0 | 5 votes |
def __init__(self, config):
    """Initialise the parent with ``config`` and create this model's helper objects.

    Sets up the class encoder, the feature vectorizers, the feature
    selector and scaler, and empty placeholders for values that are not
    assigned in this method.
    """
    super().__init__(config)
    # Label encoding and the (default-configured) feature vectorizer
    self._class_encoder = SKLabelEncoder()
    self._feat_vectorizer = DictVectorizer()
    # Selector and scaler are produced by helper methods of this class
    self._feat_selector = self._get_feature_selector()
    self._feat_scaler = self._get_feature_scaler()
    # Meta features get their own dense vectorizer
    self._meta_type = None
    self._meta_feat_vectorizer = DictVectorizer(sparse=False)
    # Placeholders, empty until assigned elsewhere
    self._base_clfs = {}
    self.cv_loss_ = None
    self.train_acc_ = None
Example #18
Source File: memm.py From mindmeld with Apache License 2.0 | 5 votes |
def setup_model(self, config):
    """Create the encoder, vectorizer, feature selector and feature scaler.

    The selector and scaler types are read from ``config.model_settings``
    under the keys "feature_selector" and "feature_scaler"; both are None
    when there are no model settings.
    """
    settings = config.model_settings
    if settings is not None:
        selector_type = settings.get("feature_selector")
        scale_type = settings.get("feature_scaler")
    else:
        selector_type = scale_type = None

    self.class_encoder = SKLabelEncoder()
    self.feat_vectorizer = DictVectorizer()
    self._feat_selector = self._get_feature_selector(selector_type)
    self._feat_scaler = self._get_feature_scaler(scale_type)
Example #19
Source File: test_dict_vectorizer.py From sparkit-learn with Apache License 2.0 | 5 votes |
def test_same_output_sparse(self):
    """Sparse output: the Spark vectorizer must agree with the local DictVectorizer."""
    X, X_rdd = self.make_dict_dataset()

    reference = DictVectorizer(sparse=True)
    distributed = SparkDictVectorizer(sparse=True)

    expected = reference.fit_transform(X)
    actual = distributed.fit_transform(X_rdd)

    # The distributed result must hold scipy sparse matrices
    assert_true(check_rdd_dtype(actual, (sp.spmatrix,)))
    assert_equal(reference.vocabulary_, distributed.vocabulary_)
    assert_array_equal(expected.toarray(), actual.toarray())
Example #20
Source File: transform_features.py From fake-news-detection with MIT License | 5 votes |
def get_feature_transformer(parser, run_grammar=True, run_tfidf=True):
    '''
    Builds a transformer that turns a text series into TFIDF counts and
    frequencies of syntactical structures. It can be used as a step in a
    SKLearn Pipeline.

    inputs:
        parser: a Spacy pipeline object
        run_grammar: include the grammar-count features
        run_tfidf: include the TFIDF features
    returns:
        feature transformer: FeatureUnion

    Note: when both flags are false the TFIDF-only union is returned,
    because the "not run_grammar" case is checked first.
    '''
    # Both sub-pipelines are always constructed; the flags only decide
    # which of them end up in the union.
    tfidf = Pipeline([
        ('cln', CleanTextTransformer()),
        ('pre', PreTokenizer(parser=parser)),
        ('vect', TfidfVectorizer(max_features=3000, decode_error='replace')),
        ('clf', None),
    ])
    grammar_counter = Pipeline([
        ('cln', CleanTextTransformer()),
        ('grm', GrammarTransformer(parser=parser)),
        ('to_dict', DictVectorizer()),
        ('clf', None),
    ])

    if run_grammar and run_tfidf:
        print('Running both feature sets.')
        return FeatureUnion([("tfidf", tfidf),
                             ('grammar_counter', grammar_counter)])

    if not run_grammar:
        print('Running only TFIDF.')
        return FeatureUnion([("tfidf", tfidf)])

    # Reaching this point means grammar is on and TFIDF is off.
    print('Running only PCFGs.')
    return FeatureUnion([('grammar_counter', grammar_counter)])
Example #21
Source File: lexrankr.py From lexrankr with MIT License | 5 votes |
def __init__(self, similarity='cosine', decay_window=20, decay_alpha=0.25, clustering='dbscan', tagger='twitter',
             useful_tags=['Noun', 'Verb', 'Adjective', 'Determiner', 'Adverb', 'Conjunction', 'Josa', 'PreEomi',
                          'Eomi', 'Suffix', 'Alpha', 'Number'],
             delimiters=['. ', '\n', '.\n'], min_token_length=2, stopwords=stopwords_ko, no_below_word_count=2,
             no_above_word_portion=0.85, max_dictionary_size=None, min_cluster_size=2, similarity_threshold=0.85,
             matrix_smoothing=False, n_clusters=None, compactify=True, **kwargs):
    """Configure the similarity function, sentence factory and clusterer.

    :param similarity: 'cosine', 'jaccard' or 'normalized_cooccurrence'.
    :param decay_window: stored as ``self.decay_window``.
    :param decay_alpha: stored as ``self.decay_alpha``.
    :param clustering: 'birch', 'dbscan', 'affinity', or None to place
        every row in cluster 0.
    :param tagger, useful_tags, delimiters, min_token_length, stopwords,
        kwargs: forwarded unchanged to ``SentenceFactory``.
    :param n_clusters: only used by the 'birch' clusterer.
    :param no_below_word_count, no_above_word_portion, max_dictionary_size,
        similarity_threshold, min_cluster_size, matrix_smoothing,
        compactify: stored on the instance under the same names.
    :raises LexRankError: for an unknown ``similarity`` or ``clustering``.

    NOTE(review): ``useful_tags`` and ``delimiters`` have list defaults,
    which are shared between calls. They are only forwarded here; confirm
    that ``SentenceFactory`` never mutates them.
    """
    self.decay_window = decay_window
    self.decay_alpha = decay_alpha

    if similarity == 'cosine':  # very, very slow :(
        self.vectorizer = DictVectorizer()
        self.uniform_sim = self._sim_cosine
    elif similarity == 'jaccard':
        self.uniform_sim = self._sim_jaccard
    elif similarity == 'normalized_cooccurrence':
        self.uniform_sim = self._sim_normalized_cooccurrence
    else:
        raise LexRankError("available similarity functions are: cosine, jaccard, normalized_cooccurrence")

    # Final similarity = decay factor * undecayed ("uniform") similarity
    self.sim = lambda sentence1, sentence2: \
        self.decay(sentence1, sentence2) * self.uniform_sim(sentence1, sentence2)

    self.factory = SentenceFactory(tagger=tagger, useful_tags=useful_tags, delimiters=delimiters,
                                   min_token_length=min_token_length, stopwords=stopwords, **kwargs)

    # Every clusterer is fed ``1 - matrix``
    # (presumably turning similarities into distances - TODO confirm).
    if clustering == 'birch':
        self._birch = Birch(threshold=0.99, n_clusters=n_clusters)
        self._clusterer = lambda matrix: self._birch.fit_predict(1 - matrix)
    elif clustering == 'dbscan':
        self._dbscan = DBSCAN()
        self._clusterer = lambda matrix: self._dbscan.fit_predict(1 - matrix)
    elif clustering == 'affinity':
        self._affinity = AffinityPropagation()
        self._clusterer = lambda matrix: self._affinity.fit_predict(1 - matrix)
    elif clustering is None:
        # No clustering: every row gets label 0
        self._clusterer = lambda matrix: [0 for index in range(matrix.shape[0])]
    else:
        # The message lists exactly the values accepted above.
        raise LexRankError("available clustering algorithms are: birch, dbscan, affinity, no-clustering(use `None`)")

    self.no_below_word_count = no_below_word_count
    self.no_above_word_portion = no_above_word_portion
    self.max_dictionary_size = max_dictionary_size
    self.similarity_threshold = similarity_threshold
    self.min_cluster_size = min_cluster_size
    self.matrix_smoothing = matrix_smoothing
    self.compactify = compactify
Example #22
Source File: test_dict_vectorizer.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_dictvectorizer():
    """Round-trip a small dataset through DictVectorizer for every option combination.

    For each combination of ``sparse``, ``dtype``, ``sort`` and list versus
    iterator input, checks the sparsity, shape and sum of the fitted output,
    that ``inverse_transform`` restores the input, that ``transform``
    reproduces ``fit_transform``, and (when ``sort`` is set) that feature
    names are sorted.
    """
    D = [{"foo": 1, "bar": 3},
         {"bar": 4, "baz": 2},
         {"bar": 1, "quux": 1, "quuux": 2}]

    for sparse in (True, False):
        for dtype in (int, np.float32, np.int16):
            for sort in (True, False):
                for iterable in (True, False):
                    v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
                    # A fresh iterator is built for each call: an iterator
                    # is exhausted after one pass.
                    X = v.fit_transform(iter(D) if iterable else D)

                    assert_equal(sp.issparse(X), sparse)
                    assert_equal(X.shape, (3, 5))
                    assert_equal(X.sum(), 14)
                    assert_equal(v.inverse_transform(X), D)

                    if sparse:
                        # CSR matrices can't be compared for equality, so
                        # densify first. ``.toarray()`` is used rather than
                        # the deprecated ``.A`` shorthand, which sparse
                        # arrays do not provide.
                        assert_array_equal(
                            X.toarray(),
                            v.transform(iter(D) if iterable else D).toarray())
                    else:
                        assert_array_equal(
                            X, v.transform(iter(D) if iterable else D))

                    if sort:
                        assert_equal(v.feature_names_,
                                     sorted(v.feature_names_))
Example #23
Source File: test_dict_vectorizer.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_feature_selection():
    """After restricting to the two chi2-selected features, only the useful ones remain."""
    # Two feature dicts sharing twenty constant features (useless in terms
    # of chi2) plus two features whose values are swapped between the dicts.
    noise = [("useless%d" % i, 10) for i in range(20)]
    d1 = dict(noise, useful1=1, useful2=20)
    d2 = dict(noise, useful1=20, useful2=1)
    samples = [d1, d2]

    # Restrict once by index list and once by boolean mask
    for indices in (True, False):
        v = DictVectorizer().fit(samples)
        X = v.transform(samples)
        selector = SelectKBest(chi2, k=2).fit(X, [0, 1])

        support = selector.get_support(indices=indices)
        v.restrict(support, indices=indices)
        assert_equal(v.get_feature_names(), ["useful1", "useful2"])
Example #24
Source File: test_dict_vectorizer.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_one_of_k():
    """String-valued features must be expanded into "name=value" features."""
    D_in = [
        {"version": "1", "ham": 2},
        {"version": "2", "spam": .3},
        {"version=3": True, "spam": -1},
    ]
    v = DictVectorizer()
    X = v.fit_transform(D_in)
    assert_equal(X.shape, (3, 5))

    # The first sample comes back with its string value folded into the key
    first_restored = v.inverse_transform(X)[0]
    assert_equal(first_restored, {"version=1": 1, "ham": 2})

    # Only the expanded name is a feature, never the bare "version"
    names = v.get_feature_names()
    assert_true("version=2" in names)
    assert_false("version" in names)
Example #25
Source File: custom_transformers.py From pandas-pipelines-custom-transformers with MIT License | 5 votes |
def fit(self, X, y=None):
    """Fit a dense DictVectorizer on the rows of ``X`` and return ``self``.

    Assumes every column of ``X`` holds strings. ``y`` is not used.
    """
    # One dict per row, keyed by column name
    records = X.to_dict('records')
    self.dv = DictVectorizer(sparse=False)
    self.dv.fit(records)
    return self
Example #26
Source File: test_dict_vectorizer.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_feature_selection():
    """After restricting to the two chi2-selected features, only the useful ones remain."""
    # Two feature dicts sharing twenty constant features (useless in terms
    # of chi2) plus two features whose values are swapped between the dicts.
    noise = [("useless%d" % i, 10) for i in range(20)]
    d1 = dict(noise, useful1=1, useful2=20)
    d2 = dict(noise, useful1=20, useful2=1)
    samples = [d1, d2]

    # Restrict once by index list and once by boolean mask
    for indices in (True, False):
        v = DictVectorizer().fit(samples)
        X = v.transform(samples)
        selector = SelectKBest(chi2, k=2).fit(X, [0, 1])

        support = selector.get_support(indices=indices)
        v.restrict(support, indices=indices)
        assert_equal(v.get_feature_names(), ["useful1", "useful2"])
Example #27
Source File: test_dict_vectorizer.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_one_of_k():
    """String-valued features must be expanded into "name=value" features."""
    D_in = [
        {"version": "1", "ham": 2},
        {"version": "2", "spam": .3},
        {"version=3": True, "spam": -1},
    ]
    v = DictVectorizer()
    X = v.fit_transform(D_in)
    assert_equal(X.shape, (3, 5))

    # The first sample comes back with its string value folded into the key
    first_restored = v.inverse_transform(X)[0]
    assert_equal(first_restored, {"version=1": 1, "ham": 2})

    # Only the expanded name is a feature, never the bare "version"
    names = v.get_feature_names()
    assert "version=2" in names
    assert "version" not in names
Example #28
Source File: test_dict_vectorizer.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_deterministic_vocabulary():
    """The fitted vocabulary must not depend on dictionary insertion order."""
    # Two equal dictionaries built from differently ordered item lists
    ordered_items = [("%03d" % i, i) for i in range(1000)]
    shuffled_items = list(ordered_items)
    Random(42).shuffle(shuffled_items)

    d_sorted = dict(ordered_items)
    d_shuffled = dict(shuffled_items)

    # The layout of the input must not affect the resulting vocabulary
    vocab_sorted = DictVectorizer().fit([d_sorted]).vocabulary_
    vocab_shuffled = DictVectorizer().fit([d_shuffled]).vocabulary_
    assert_equal(vocab_sorted, vocab_shuffled)
Example #29
Source File: function_clustering.py From Firmware_Slap with GNU General Public License v3.0 | 5 votes |
def funcs_to_sparse(func_list):
    """Fit a DictVectorizer on ``func_list``.

    Returns a ``(vectorizer, matrix)`` pair: the fitted vectorizer and the
    result of its ``fit_transform`` on ``func_list``.
    """
    vectorizer = DictVectorizer()
    matrix = vectorizer.fit_transform(func_list)
    return vectorizer, matrix
Example #30
Source File: pipeline.py From whereami with GNU Affero General Public License v3.0 | 5 votes |
# Sentinel meaning "no classifier supplied"; it keeps an explicit ``None``
# argument distinguishable from the default.
_DEFAULT_CLF = object()


def get_pipeline(clf=_DEFAULT_CLF):
    """Build a pipeline of a dense DictVectorizer followed by ``clf``.

    :param clf: final estimator of the pipeline. When omitted, a new
        ``RandomForestClassifier(n_estimators=100, class_weight="balanced")``
        is created for this call.
    :return: the pipeline produced by ``make_pipeline``.

    The default classifier is created per call rather than in the
    signature. A default built in the signature is evaluated once, so every
    pipeline obtained without an argument would share, and refit, the same
    estimator object.
    """
    if clf is _DEFAULT_CLF:
        clf = RandomForestClassifier(n_estimators=100, class_weight="balanced")
    return make_pipeline(DictVectorizer(sparse=False), clf)