Python sklearn.feature_extraction.text.HashingVectorizer() Examples
The following are 27 code examples of sklearn.feature_extraction.text.HashingVectorizer(), drawn from open-source projects. Each example lists its original source file, project, and license.
You may also want to check out all available functions and classes of the module sklearn.feature_extraction.text.
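Before the project examples, here is a minimal, self-contained sketch of typical HashingVectorizer usage (the corpus and parameters are illustrative only, not taken from any of the projects below):

from sklearn.feature_extraction.text import HashingVectorizer

# A toy corpus, purely for illustration
docs = ["the cat sat on the mat", "the dog ate my homework"]

# HashingVectorizer is stateless: transform() works without fit(), which is
# why it is popular for streaming and out-of-core text processing.
vectorizer = HashingVectorizer(n_features=2 ** 10, ngram_range=(1, 2))
X = vectorizer.transform(docs)

print(X.shape)  # (2, 1024), a sparse CSR matrix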
Example #1
Source File: vectorizer.py From robotreviewer with GNU General Public License v3.0
def transform(self, X_si, high=None, low=None, limit=None):
    """
    Same as HashingVectorizer transform, except allows for
    interaction list, which is an iterable the same length as X
    filled with True/False. This method adds an empty row to
    docs labelled as False.
    """
    analyzer = self.build_analyzer()
    X = self._get_hasher().transform(
        analyzer(self._deal_with_input(doc)) for doc in X_si)
    X.data.fill(1)

    if self.norm is not None:
        X = normalize(X, norm=self.norm, copy=False)

    if low:
        X = self._limit_features(X, low=low)
    return X
Example #2
Source File: pico_robot.py From robotreviewer with GNU General Public License v3.0
def __init__(self):
    self.vectorizer = HashingVectorizer(ngram_range=(1, 2))
    self.dict_vectorizer = DictVectorizer()

    # These are set dynamically in training
    # but fixed here to match the end feature names
    # in the trained model. If the model is retrained then
    # these may have to change
    self.dict_vectorizer.feature_names_ = [
        'DocumentPositionQuintile0',
        'DocumentPositionQuintile1',
        'DocumentPositionQuintile2',
        'DocumentPositionQuintile3',
        'DocumentPositionQuintile4',
        'DocumentPositionQuintile5',
        'DocumentPositionQuintile6']
    self.dict_vectorizer.vocabulary_ = {k: i for i, k in enumerate(self.dict_vectorizer.feature_names_)}

    self.drugbank = Drugbank()
Example #3
Source File: test_text.py From twitter-stock-recommendation with MIT License
def test_pickling_vectorizer():
    instances = [
        HashingVectorizer(),
        HashingVectorizer(norm='l1'),
        HashingVectorizer(binary=True),
        HashingVectorizer(ngram_range=(1, 2)),
        CountVectorizer(),
        CountVectorizer(preprocessor=strip_tags),
        CountVectorizer(analyzer=lazy_analyze),
        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
        TfidfVectorizer(),
        TfidfVectorizer(analyzer=lazy_analyze),
        TfidfVectorizer().fit(JUNK_FOOD_DOCS),
    ]

    for orig in instances:
        s = pickle.dumps(orig)
        copy = pickle.loads(s)
        assert_equal(type(copy), orig.__class__)
        assert_equal(copy.get_params(), orig.get_params())
        assert_array_equal(
            copy.fit_transform(JUNK_FOOD_DOCS).toarray(),
            orig.fit_transform(JUNK_FOOD_DOCS).toarray())
Example #4
Source File: bow.py From broca with MIT License
def __init__(self, min_df=1, max_df=0.9, tokenizer=LemmaTokenizer, hash=False):
    """
    `min_df` is set to filter out extremely rare words,
    since we don't want those to dominate the distance metric.

    `max_df` is set to filter out extremely common words,
    since they don't convey much information.
    """
    # Wrap the specified tokenizer
    t = Tokenizer(tokenizer())

    if hash:
        vectr = HashingVectorizer(input='content',
                                  stop_words='english',
                                  lowercase=True,
                                  tokenizer=t)
    else:
        vectr = CountVectorizer(input='content',
                                stop_words='english',
                                lowercase=True,
                                tokenizer=t,
                                min_df=min_df,
                                max_df=max_df)

    args = [
        ('vectorizer', vectr),
        ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
        ('normalizer', Normalizer(copy=False))
    ]

    self.pipeline = Pipeline(args)
    self.trained = False
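Note the asymmetry above: min_df and max_df are passed only to CountVectorizer. HashingVectorizer never builds a vocabulary, so it cannot filter by document frequency, which is why the hash branch omits those arguments. A minimal sketch of a comparable hashing pipeline (the corpus is made up and no custom tokenizer is wired in):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.preprocessing import Normalizer

# Hashing-based variant: no min_df/max_df, because HashingVectorizer
# never sees corpus-level document frequencies.
pipeline = Pipeline([
    ('vectorizer', HashingVectorizer(stop_words='english', lowercase=True)),
    ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
    ('normalizer', Normalizer(copy=False)),
])
X = pipeline.fit_transform(["some example document", "another example document"])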
Example #5
Source File: test_text.py From twitter-stock-recommendation with MIT License
def test_hashed_binary_occurrences():
    # by default multiple occurrences are counted as longs
    test_data = ['aaabc', 'abbde']
    vect = HashingVectorizer(analyzer='char', non_negative=True,
                             norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X[0:1].data), 3)
    assert_equal(np.max(X[1:2].data), 2)
    assert_equal(X.dtype, np.float64)

    # using boolean features, we can fetch the binary occurrence info
    # instead.
    vect = HashingVectorizer(analyzer='char', non_negative=True,
                             binary=True, norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X.data), 1)
    assert_equal(X.dtype, np.float64)

    # check the ability to change the dtype
    vect = HashingVectorizer(analyzer='char', non_negative=True,
                             binary=True, norm=None, dtype=np.float64)
    X = vect.transform(test_data)
    assert_equal(X.dtype, np.float64)
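The non_negative parameter used above was deprecated in scikit-learn 0.19 and has since been removed; in current releases the roughly equivalent behaviour is alternate_sign=False, as the newer variant of this test in Example #6 shows. A minimal sketch of the modern spelling:

import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer

# Character counts without sign flipping (recent scikit-learn versions)
vect = HashingVectorizer(analyzer='char', alternate_sign=False, norm=None)
X = vect.transform(['aaabc', 'abbde'])
assert np.max(X[0:1].data) == 3  # 'a' occurs three times in the first document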
Example #6
Source File: test_text.py From Mastering-Elasticsearch-7.0 with MIT License
def test_hashed_binary_occurrences():
    # by default multiple occurrences are counted as longs
    test_data = ['aaabc', 'abbde']
    vect = HashingVectorizer(alternate_sign=False, analyzer='char', norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X[0:1].data), 3)
    assert_equal(np.max(X[1:2].data), 2)
    assert_equal(X.dtype, np.float64)

    # using boolean features, we can fetch the binary occurrence info
    # instead.
    vect = HashingVectorizer(analyzer='char', alternate_sign=False,
                             binary=True, norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X.data), 1)
    assert_equal(X.dtype, np.float64)

    # check the ability to change the dtype
    vect = HashingVectorizer(analyzer='char', alternate_sign=False,
                             binary=True, norm=None, dtype=np.float64)
    X = vect.transform(test_data)
    assert_equal(X.dtype, np.float64)
Example #7
Source File: test_text.py From Mastering-Elasticsearch-7.0 with MIT License
def test_vectorizer_unicode():
    # tests that the count vectorizer works with cyrillic.
    document = (
        "Машинное обучение — обширный подраздел искусственного "
        "интеллекта, изучающий методы построения алгоритмов, "
        "способных обучаться."
    )

    vect = CountVectorizer()
    X_counted = vect.fit_transform([document])
    assert_equal(X_counted.shape, (1, 12))

    vect = HashingVectorizer(norm=None, alternate_sign=False)
    X_hashed = vect.transform([document])
    assert_equal(X_hashed.shape, (1, 2 ** 20))

    # No collisions on such a small dataset
    assert_equal(X_counted.nnz, X_hashed.nnz)

    # When norm is None and not alternate_sign, the tokens are counted up to
    # collisions
    assert_array_equal(np.sort(X_counted.data), np.sort(X_hashed.data))
Example #8
Source File: column_encoders.py From datawig with Apache License 2.0
def __init__(self,
             input_columns: Any,
             output_column: str = None,
             max_tokens: int = 2 ** 18,
             tokens: str = 'chars',
             ngram_range: tuple = None,
             prefixed_concatenation: bool = True) -> None:

    if ngram_range is None:
        ngram_range = (1, 3) if tokens == 'words' else (1, 5)

    ColumnEncoder.__init__(self, input_columns, output_column, int(max_tokens))

    if tokens == 'words':
        self.vectorizer = HashingVectorizer(n_features=self.output_dim,
                                            ngram_range=ngram_range)
    elif tokens == 'chars':
        self.vectorizer = HashingVectorizer(n_features=self.output_dim,
                                            ngram_range=ngram_range,
                                            analyzer="char")
    else:
        logger.debug(
            "BowEncoder attribute tokens has to be 'words' or 'chars', defaulting to 'chars'")
        self.vectorizer = HashingVectorizer(n_features=self.output_dim,
                                            ngram_range=ngram_range,
                                            analyzer="char")

    self.prefixed_concatenation = prefixed_concatenation
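The tokens='words' / tokens='chars' switch above simply selects HashingVectorizer's analyzer. A small illustrative sketch (corpus and feature sizes are made up) of how the two analyzers treat near-duplicate strings differently:

from sklearn.feature_extraction.text import HashingVectorizer

word_vect = HashingVectorizer(n_features=2 ** 16, ngram_range=(1, 3))
char_vect = HashingVectorizer(n_features=2 ** 16, ngram_range=(1, 5), analyzer='char')

docs = ["New York", "Newyork"]  # spelling variation, illustrative only
Xw = word_vect.transform(docs)
Xc = char_vect.transform(docs)

# Number of hashed features the two documents share under each analyzer:
# word n-grams share little or nothing, character n-grams overlap heavily.
print(Xw[0].multiply(Xw[1]).nnz)
print(Xc[0].multiply(Xc[1]).nnz)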
Example #9
Source File: test_search_2.py From spark-sklearn with Apache License 2.0
def test_cv_pipeline(self):
    pipeline = SKL_Pipeline([
        ('vect', SKL_HashingVectorizer(n_features=20)),
        ('tfidf', SKL_TfidfTransformer(use_idf=False)),
        ('lasso', SKL_Lasso())
    ])
    parameters = {
        'lasso__alpha': (0.001, 0.005, 0.01)
    }
    grid_search = GridSearchCV(self.sc, pipeline, parameters)
    data = [('hi there', 0.0),
            ('what is up', 1.0),
            ('huh', 1.0),
            ('now is the time', 5.0),
            ('for what', 0.0),
            ('the spark was there', 5.0),
            ('and so', 3.0),
            ('were many socks', 0.0),
            ('really', 1.0),
            ('too cool', 2.0)]
    df = self.sql.createDataFrame(data, ["review", "rating"]).toPandas()
    skl_gs = grid_search.fit(df.review.values, df.rating.values)
    assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])
Example #10
Source File: test_text.py From Mastering-Elasticsearch-7.0 with MIT License
def test_vectorizer_stop_words_inconsistent():
    lstr = "['and', 'll', 've']"
    message = ('Your stop_words may be inconsistent with your '
               'preprocessing. Tokenizing the stop words generated '
               'tokens %s not in stop_words.' % lstr)
    for vec in [CountVectorizer(),
                TfidfVectorizer(), HashingVectorizer()]:
        vec.set_params(stop_words=["you've", "you", "you'll", 'AND'])
        assert_warns_message(UserWarning, message, vec.fit_transform,
                             ['hello world'])
        # reset stop word validation
        del vec._stop_words_id
        assert _check_stop_words_consistency(vec) is False

    # Only one warning per stop list
    assert_no_warnings(vec.fit_transform, ['hello world'])
    assert _check_stop_words_consistency(vec) is None

    # Test caching of inconsistency assessment
    vec.set_params(stop_words=["you've", "you", "you'll", 'blah', 'AND'])
    assert_warns_message(UserWarning, message, vec.fit_transform,
                         ['hello world'])
Example #11
Source File: common_utils.py From interpret-community with MIT License
def create_multiclass_sparse_newsgroups_data():
    remove = ('headers', 'footers', 'quotes')
    categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
    from sklearn.datasets import fetch_20newsgroups
    ngroups = fetch_20newsgroups(subset='train', categories=categories,
                                 shuffle=True, random_state=42, remove=remove)
    x_train, x_test, y_train, y_validation = train_test_split(ngroups.data, ngroups.target,
                                                              test_size=0.02, random_state=42)
    from sklearn.feature_extraction.text import HashingVectorizer
    vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
                                   n_features=2**16)
    x_train = vectorizer.transform(x_train)
    x_test = vectorizer.transform(x_test)
    return x_train, x_test, y_train, y_validation, categories, vectorizer
Example #12
Source File: ngrams_handling.py From JaSt with GNU General Public License v3.0
def csr_proba_of_n_grams_hash_storage(input_file, tolerance, n, n_features):
    """
    Maps an input file to a CSR matrix containing the frequency of its n-grams.
        - Production of n-grams and analysis of their frequency (+ normalization);
        - Each n-gram is mapped to a consistent dimension of a vector space
          (with a hash; collisions are possible if n_features is too small);
        - Storage of the results in a CSR matrix.

    -------
    Parameters:
    - input_file: str
        Path of the file to be analysed.
    - tolerance: str
        Indicates whether esprima should tolerate a few cases of syntax errors
        (corresponds to esprima's tolerant option). The values 'true' and 'false'
        shall be used to enable this tolerant mode.
    - n: int
        Stands for the size of the sliding-window which goes through the units
        contained in the files to be analysed.
    - n_features: int
        Size of the resulting vector space. This can be changed in nb_features(n).

    -------
    Returns:
    - csr_matrix
        Non-compacted dimension: 1 x n_features;
        Value: probability of occurrences of an n-gram.
    - or None if the file could not be parsed.
    """

    tokens_int = tokens.tokens_to_numbers(input_file, tolerance)
    if tokens_int is not None:
        corpus = [str(tokens_int)]
        vectorizer = HashingVectorizer(token_pattern=r"(?u)\b\w+\b", ngram_range=(n, n),
                                       norm='l1', alternate_sign=False, n_features=n_features)
        res = vectorizer.fit_transform(corpus)
        return res
    return None
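Since alternate_sign=False keeps every hashed count non-negative, the norm='l1' above turns each row into a probability distribution over hashed n-grams (the row sums to 1). A hedged sketch with a made-up stand-in for the token stream that tokens_to_numbers() would produce:

from sklearn.feature_extraction.text import HashingVectorizer

# Illustrative token stream: a whitespace-separated string of token identifiers
corpus = ["1 5 5 2 1 5 3"]

vectorizer = HashingVectorizer(token_pattern=r"(?u)\b\w+\b", ngram_range=(2, 2),
                               norm='l1', alternate_sign=False, n_features=1024)
res = vectorizer.fit_transform(corpus)
print(res.sum())  # ~1.0: the bigram counts have been normalised to probabilities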
Example #13
Source File: test_simple_imputer.py From datawig with Apache License 2.0
def test_hpo_explainable(test_dir, data_frame):
    from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer

    feature_col, label_col = "feature", "label"
    df = data_frame(feature_col=feature_col, label_col=label_col)

    for explainable, vectorizer in [(False, HashingVectorizer), (True, TfidfVectorizer)]:
        imputer = SimpleImputer(
            input_columns=[feature_col],
            output_column=label_col,
            output_path=test_dir,
            is_explainable=explainable
        ).fit_hpo(df, num_epochs=3)

        assert isinstance(imputer.imputer.data_encoders[0].vectorizer, vectorizer)
Example #14
Source File: test_text.py From twitter-stock-recommendation with MIT License
def test_word_analyzer_unigrams():
    for Vectorizer in (CountVectorizer, HashingVectorizer):
        wa = Vectorizer(strip_accents='ascii').build_analyzer()
        text = ("J'ai mang\xe9 du kangourou ce midi, "
                "c'\xe9tait pas tr\xeas bon.")
        expected = ['ai', 'mange', 'du', 'kangourou', 'ce', 'midi',
                    'etait', 'pas', 'tres', 'bon']
        assert_equal(wa(text), expected)

        text = "This is a test, really.\n\n I met Harry yesterday."
        expected = ['this', 'is', 'test', 'really', 'met', 'harry',
                    'yesterday']
        assert_equal(wa(text), expected)

        wa = Vectorizer(input='file').build_analyzer()
        text = StringIO("This is a test with a file-like object!")
        expected = ['this', 'is', 'test', 'with', 'file', 'like',
                    'object']
        assert_equal(wa(text), expected)

        # with custom preprocessor
        wa = Vectorizer(preprocessor=uppercase).build_analyzer()
        text = ("J'ai mang\xe9 du kangourou ce midi, "
                " c'\xe9tait pas tr\xeas bon.")
        expected = ['AI', 'MANGE', 'DU', 'KANGOUROU', 'CE', 'MIDI',
                    'ETAIT', 'PAS', 'TRES', 'BON']
        assert_equal(wa(text), expected)

        # with custom tokenizer
        wa = Vectorizer(tokenizer=split_tokenize,
                        strip_accents='ascii').build_analyzer()
        text = ("J'ai mang\xe9 du kangourou ce midi, "
                "c'\xe9tait pas tr\xeas bon.")
        expected = ["j'ai", 'mange', 'du', 'kangourou', 'ce', 'midi,',
                    "c'etait", 'pas', 'tres', 'bon.']
        assert_equal(wa(text), expected)
Example #15
Source File: test_text.py From twitter-stock-recommendation with MIT License
def test_hashing_vectorizer():
    v = HashingVectorizer()
    X = v.transform(ALL_FOOD_DOCS)
    token_nnz = X.nnz
    assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features))
    assert_equal(X.dtype, v.dtype)

    # By default the hashed values receive a random sign and l2 normalization
    # makes the feature values bounded
    assert_true(np.min(X.data) > -1)
    assert_true(np.min(X.data) < 0)
    assert_true(np.max(X.data) > 0)
    assert_true(np.max(X.data) < 1)

    # Check that the rows are normalized
    for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[0].data, 2), 1.0)

    # Check vectorization with some non-default parameters
    v = HashingVectorizer(ngram_range=(1, 2), non_negative=True, norm='l1')
    X = v.transform(ALL_FOOD_DOCS)
    assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features))
    assert_equal(X.dtype, v.dtype)

    # ngrams generate more non zeros
    ngrams_nnz = X.nnz
    assert_true(ngrams_nnz > token_nnz)
    assert_true(ngrams_nnz < 2 * token_nnz)

    # makes the feature values bounded
    assert_true(np.min(X.data) > 0)
    assert_true(np.max(X.data) < 1)

    # Check that the rows are normalized
    for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[0].data, 1), 1.0)
Example #16
Source File: test_text.py From twitter-stock-recommendation with MIT License
def test_vectorizer_unicode():
    # tests that the count vectorizer works with cyrillic.
    document = (
        "\xd0\x9c\xd0\xb0\xd1\x88\xd0\xb8\xd0\xbd\xd0\xbd\xd0\xbe\xd0"
        "\xb5 \xd0\xbe\xd0\xb1\xd1\x83\xd1\x87\xd0\xb5\xd0\xbd\xd0\xb8\xd0"
        "\xb5 \xe2\x80\x94 \xd0\xbe\xd0\xb1\xd1\x88\xd0\xb8\xd1\x80\xd0\xbd"
        "\xd1\x8b\xd0\xb9 \xd0\xbf\xd0\xbe\xd0\xb4\xd1\x80\xd0\xb0\xd0\xb7"
        "\xd0\xb4\xd0\xb5\xd0\xbb \xd0\xb8\xd1\x81\xd0\xba\xd1\x83\xd1\x81"
        "\xd1\x81\xd1\x82\xd0\xb2\xd0\xb5\xd0\xbd\xd0\xbd\xd0\xbe\xd0\xb3"
        "\xd0\xbe \xd0\xb8\xd0\xbd\xd1\x82\xd0\xb5\xd0\xbb\xd0\xbb\xd0"
        "\xb5\xd0\xba\xd1\x82\xd0\xb0, \xd0\xb8\xd0\xb7\xd1\x83\xd1\x87"
        "\xd0\xb0\xd1\x8e\xd1\x89\xd0\xb8\xd0\xb9 \xd0\xbc\xd0\xb5\xd1\x82"
        "\xd0\xbe\xd0\xb4\xd1\x8b \xd0\xbf\xd0\xbe\xd1\x81\xd1\x82\xd1\x80"
        "\xd0\xbe\xd0\xb5\xd0\xbd\xd0\xb8\xd1\x8f \xd0\xb0\xd0\xbb\xd0\xb3"
        "\xd0\xbe\xd1\x80\xd0\xb8\xd1\x82\xd0\xbc\xd0\xbe\xd0\xb2, \xd1\x81"
        "\xd0\xbf\xd0\xbe\xd1\x81\xd0\xbe\xd0\xb1\xd0\xbd\xd1\x8b\xd1\x85 "
        "\xd0\xbe\xd0\xb1\xd1\x83\xd1\x87\xd0\xb0\xd1\x82\xd1\x8c\xd1\x81\xd1"
        "\x8f.")

    vect = CountVectorizer()
    X_counted = vect.fit_transform([document])
    assert_equal(X_counted.shape, (1, 15))

    vect = HashingVectorizer(norm=None, non_negative=True)
    X_hashed = vect.transform([document])
    assert_equal(X_hashed.shape, (1, 2 ** 20))

    # No collisions on such a small dataset
    assert_equal(X_counted.nnz, X_hashed.nnz)

    # When norm is None and non_negative, the tokens are counted up to
    # collisions
    assert_array_equal(np.sort(X_counted.data), np.sort(X_hashed.data))
Example #17
Source File: test_text.py From twitter-stock-recommendation with MIT License
def test_vectorizer_string_object_as_input():
    message = ("Iterable over raw text documents expected, "
               "string object received.")
    for vec in [CountVectorizer(), TfidfVectorizer(), HashingVectorizer()]:
        assert_raise_message(
            ValueError, message, vec.fit_transform, "hello world!")
        assert_raise_message(
            ValueError, message, vec.fit, "hello world!")
        assert_raise_message(
            ValueError, message, vec.transform, "hello world!")
Example #18
Source File: common_utils.py From interpret-community with MIT License
def create_binary_sparse_newsgroups_data():
    categories = ['alt.atheism', 'soc.religion.christian']
    newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
    newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
    class_names = ['atheism', 'christian']
    x_train = newsgroups_train.data
    x_test = newsgroups_test.data
    y_train = newsgroups_train.target
    y_validation = newsgroups_test.target
    from sklearn.feature_extraction.text import HashingVectorizer
    vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
                                   n_features=2**16)
    x_train = vectorizer.transform(x_train)
    x_test = vectorizer.transform(x_test)
    return x_train, x_test, y_train, y_validation, class_names, vectorizer
Example #19
Source File: test_text.py From Mastering-Elasticsearch-7.0 with MIT License
def test_hashingvectorizer_nan_in_docs():
    # np.nan can appear when using pandas to load text fields from a csv file
    # with missing values.
    message = "np.nan is an invalid document, expected byte or unicode string."
    exception = ValueError

    def func():
        hv = HashingVectorizer()
        hv.fit_transform(['hello world', np.nan, 'hello hello'])

    assert_raise_message(exception, message, func)
Example #20
Source File: test_text.py From Mastering-Elasticsearch-7.0 with MIT License
def test_pickling_vectorizer():
    instances = [
        HashingVectorizer(),
        HashingVectorizer(norm='l1'),
        HashingVectorizer(binary=True),
        HashingVectorizer(ngram_range=(1, 2)),
        CountVectorizer(),
        CountVectorizer(preprocessor=strip_tags),
        CountVectorizer(analyzer=lazy_analyze),
        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
        TfidfVectorizer(),
        TfidfVectorizer(analyzer=lazy_analyze),
        TfidfVectorizer().fit(JUNK_FOOD_DOCS),
    ]

    for orig in instances:
        s = pickle.dumps(orig)
        copy = pickle.loads(s)
        assert_equal(type(copy), orig.__class__)
        assert_equal(copy.get_params(), orig.get_params())
        if IS_PYPY and isinstance(orig, HashingVectorizer):
            continue
        else:
            assert_array_equal(
                copy.fit_transform(JUNK_FOOD_DOCS).toarray(),
                orig.fit_transform(JUNK_FOOD_DOCS).toarray())
Example #21
Source File: test_text.py From Mastering-Elasticsearch-7.0 with MIT License
def test_hashing_vectorizer():
    v = HashingVectorizer()
    X = v.transform(ALL_FOOD_DOCS)
    token_nnz = X.nnz
    assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features))
    assert_equal(X.dtype, v.dtype)

    # By default the hashed values receive a random sign and l2 normalization
    # makes the feature values bounded
    assert np.min(X.data) > -1
    assert np.min(X.data) < 0
    assert np.max(X.data) > 0
    assert np.max(X.data) < 1

    # Check that the rows are normalized
    for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[0].data, 2), 1.0)

    # Check vectorization with some non-default parameters
    v = HashingVectorizer(ngram_range=(1, 2), norm='l1')
    X = v.transform(ALL_FOOD_DOCS)
    assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features))
    assert_equal(X.dtype, v.dtype)

    # ngrams generate more non zeros
    ngrams_nnz = X.nnz
    assert ngrams_nnz > token_nnz
    assert ngrams_nnz < 2 * token_nnz

    # makes the feature values bounded
    assert np.min(X.data) > -1
    assert np.max(X.data) < 1

    # Check that the rows are normalized
    for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[0].data, 1), 1.0)
Example #22
Source File: rct_robot.py From robotreviewer with GNU General Public License v3.0
def __init__(self):
    from keras.preprocessing import sequence
    from keras.models import load_model
    from keras.models import Sequential
    from keras.preprocessing import sequence
    from keras.layers import Dense, Dropout, Activation, Lambda, Input, merge, Flatten
    from keras.layers import Embedding
    from keras.layers import Convolution1D, MaxPooling1D
    from keras import backend as K
    from keras.models import Model
    from keras.regularizers import l2

    global sequence, load_model, Sequential, Dense, Dropout, Activation, Lambda, Input, merge, Flatten
    global Embedding, Convolution1D, MaxPooling1D, K, Model, l2

    self.svm_clf = MiniClassifier(os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))
    cnn_weight_files = glob.glob(os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
    self.cnn_clfs = [load_model(cnn_weight_file) for cnn_weight_file in cnn_weight_files]
    self.svm_vectorizer = HashingVectorizer(binary=False, ngram_range=(1, 1), stop_words='english')
    self.cnn_vectorizer = KerasVectorizer(vocab_map_file=os.path.join(robotreviewer.DATA_ROOT, 'rct/cnn_vocab_map.pck'),
                                          stop_words='english')

    with open(os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_model_calibration.json'), 'r') as f:
        self.constants = json.load(f)

    self.calibration_lr = {}
    with open(os.path.join(robotreviewer.DATA_ROOT, 'rct/svm_cnn_ptyp_calibration.pck'), 'rb') as f:
        self.calibration_lr['svm_cnn_ptyp'] = pickle.load(f)
    with open(os.path.join(robotreviewer.DATA_ROOT, 'rct/svm_cnn_calibration.pck'), 'rb') as f:
        self.calibration_lr['svm_cnn'] = pickle.load(f)
Example #23
Source File: bias_ab_robot.py From robotreviewer with GNU General Public License v3.0
def __init__(self):
    with open(robotreviewer.get_data(os.path.join('bias_ab', 'bias_prob_clf.pck')), 'rb') as f:
        self.clf = pickle.load(f)
    self.vec = HashingVectorizer(ngram_range=(1, 3), stop_words='english')
Example #24
Source File: pubmed_robot.py From robotreviewer with GNU General Public License v3.0
def __init__(self):
    raw_data = np.load(robotreviewer.get_data('pubmed/pubmed_title_hash_2016_07_24.npz'))
    self.vec_ti = csr_matrix((raw_data['data'], raw_data['indices'], raw_data['indptr']),
                             raw_data['shape'])
    self.pmid_ind = np.load(robotreviewer.get_data('pubmed/pubmed_index_2016_07_24.npz'))['pmid_ind']
    self.vectorizer = HashingVectorizer(binary=True, stop_words='english')

    # load database
    self.connection = sqlite3.connect(robotreviewer.get_data('pubmed/pubmed_rcts_2016_07_24.sqlite'))
    self.c = self.connection.cursor()
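Because binary=True plus the default l2 norm yields unit-length, set-like title vectors, stored titles can be ranked against a query with sparse dot products. A hedged sketch of that idea (titles and variable names are illustrative, not the robotreviewer code):

from sklearn.feature_extraction.text import HashingVectorizer

titles = ["Aspirin for primary prevention",
          "Aspirin in primary prevention of cardiovascular disease"]
query = ["aspirin primary prevention"]

vec = HashingVectorizer(binary=True, stop_words='english')
T = vec.transform(titles)
q = vec.transform(query)

# Rows are l2-normalised, so the sparse dot product is a cosine-style score.
scores = T.dot(q.T).toarray().ravel()
print(scores)  # one similarity score per stored title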
Example #25
Source File: similarity_encoder.py From dirty_cat with BSD 3-Clause "New" or "Revised" License
def get_kmeans_prototypes(X, n_prototypes, hashing_dim=128, ngram_range=(3, 3),
                          sparse=False, sample_weight=None, random_state=None):
    """
    Computes prototypes based on:
      - dimensionality reduction (via hashing n-grams)
      - k-means clustering
      - nearest neighbor
    """
    vectorizer = HashingVectorizer(analyzer='char', norm=None,
                                   alternate_sign=False,
                                   ngram_range=ngram_range,
                                   n_features=hashing_dim)
    projected = vectorizer.transform(X)
    if not sparse:
        projected = projected.toarray()
    kmeans = KMeans(n_clusters=n_prototypes, random_state=random_state)
    kmeans.fit(projected, sample_weight=sample_weight)
    centers = kmeans.cluster_centers_
    neighbors = NearestNeighbors()
    neighbors.fit(projected)
    indexes_prototypes = np.unique(neighbors.kneighbors(centers, 1)[-1])
    if indexes_prototypes.shape[0] < n_prototypes:
        warnings.warn('Final number of unique prototypes is lower than '
                      'n_prototypes (expected)')
    return np.sort(X[indexes_prototypes])
Example #26
Source File: pipeline_builder.py From texta with GNU General Public License v3.0
def get_pipeline_builder():
    pipe_builder = PipelineBuilder()

    # Feature Extraction
    params = {}
    pipe_builder.add_extractor('HashingVectorizer', HashingVectorizer, 'Hashing Vectorizer', params)

    params = {'ngram_range': [(1, 1), (1, 2)], 'min_df': [5]}
    pipe_builder.add_extractor('CountVectorizer', CountVectorizer, 'Count Vectorizer', params)

    params = {'ngram_range': [(1, 1), (1, 2)], 'min_df': [5]}
    pipe_builder.add_extractor('TfidfVectorizer', TfidfVectorizer, 'TfIdf Vectorizer', params)

    # Dimension Reduction
    params = {}
    pipe_builder.add_reductor('No_Reduction', ModelNull, 'None', params)

    params = {}
    pipe_builder.add_reductor('TruncatedSVD', TruncatedSVD, 'Truncated SVD', params)

    # Normalization
    params = {}
    pipe_builder.add_normalizer('No_Normalization', ModelNull, 'None', params)

    params = {}
    pipe_builder.add_normalizer('Normalizer', Normalizer, 'Normalizer', params)

    # Classification Models
    params = {}
    pipe_builder.add_classifier('LogisticRegressionClassifier', LogisticRegression, 'Logistic Regression', params)

    params = {}
    pipe_builder.add_classifier('LinearSVC', LinearSVC, 'LinearSVC', params)

    params = {}
    pipe_builder.add_classifier('KNeighborsClassifier', KNeighborsClassifier, 'K-Neighbors', params)

    params = {}
    pipe_builder.add_classifier('RadiusNeighborsClassifier', RadiusNeighborsClassifier, 'Radius Neighbors', params)

    return pipe_builder
Example #27
Source File: similarity_encoder.py From dirty_cat with BSD 3-Clause "New" or "Revised" License
def ngram_similarity(X, cats, ngram_range, hashing_dim, dtype=np.float64):
    """
    Similarity encoding for dirty categorical variables:
    Given two arrays of strings, returns the similarity encoding matrix
    of size len(X) x len(cats)

    ngram_sim(s_i, s_j) =
        ||min(ci, cj)||_1 / (||ci||_1 + ||cj||_1 - ||min(ci, cj)||_1)
    """
    min_n, max_n = ngram_range
    unq_X = np.unique(X)
    cats = np.array([' %s ' % cat for cat in cats])
    unq_X_ = np.array([' %s ' % x for x in unq_X])
    if not hashing_dim:
        vectorizer = CountVectorizer(analyzer='char',
                                     ngram_range=(min_n, max_n),
                                     dtype=dtype)
        vectorizer.fit(np.concatenate((cats, unq_X_)))
    else:
        vectorizer = HashingVectorizer(analyzer='char',
                                       ngram_range=(min_n, max_n),
                                       n_features=hashing_dim,
                                       norm=None,
                                       alternate_sign=False,
                                       dtype=dtype)
        vectorizer.fit(X)
    count_cats = vectorizer.transform(cats)
    count_X = vectorizer.transform(unq_X_)
    # We don't need the vectorizer anymore, delete it to save memory
    del vectorizer
    sum_cats = np.asarray(count_cats.sum(axis=1))
    SE_dict = {}
    for i, x in enumerate(count_X):
        _, nonzero_idx, nonzero_vals = sparse.find(x)
        samegrams = np.asarray(
            (count_cats[:, nonzero_idx].minimum(nonzero_vals)).sum(axis=1))
        allgrams = x.sum() + sum_cats - samegrams
        similarity = np.divide(samegrams, allgrams)
        SE_dict[unq_X[i]] = similarity.reshape(-1)
    # We don't need the counts anymore, delete them to save memory
    del count_cats, count_X
    out = np.empty((len(X), similarity.size), dtype=dtype)
    for x, out_row in zip(X, out):
        out_row[:] = SE_dict[x]
    return np.nan_to_num(out, copy=False)