Python keras.preprocessing Examples

The following are 3 code examples of the keras.preprocessing module. You can go to the original project or source file by following the link above each example. You may also want to check out all of the other available functions and classes of the keras module.
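Before the project snippets, here is a minimal sketch of the two keras.preprocessing utilities they rely on, Tokenizer and pad_sequences; the toy texts and parameter values are illustrative only:

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

texts = ["the cat sat on the mat", "the dog ate my homework"]  # toy corpus

# Build a word -> integer index over the corpus, keeping the most frequent words.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(texts)

# Convert each text to a list of word indices, then pad/truncate to a fixed length.
sequences = tokenizer.texts_to_sequences(texts)
x = pad_sequences(sequences, maxlen=400)
print(x.shape)  # (2, 400)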
Example #1
Source File: autogen.py    From GraphicDesignPatternByPython with MIT License
def clean_module_name(name):
    # Map the standalone keras_applications / keras_preprocessing package names
    # back to the keras.applications / keras.preprocessing import paths.
    if name.startswith('keras_applications'):
        name = name.replace('keras_applications', 'keras.applications')
    if name.startswith('keras_preprocessing'):
        name = name.replace('keras_preprocessing', 'keras.preprocessing')
    assert name[:6] == 'keras.', 'Invalid module name: %s' % name
    return name
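A couple of illustrative calls (not part of autogen.py) show what the helper returns:

clean_module_name('keras_preprocessing.image')    # -> 'keras.preprocessing.image'
clean_module_name('keras_applications.resnet50')  # -> 'keras.applications.resnet50'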
Example #2
Source File: fasttext.py    From sears with BSD 2-Clause "Simplified" License
def fit(self, X, Y, ngram_range=1, max_features=20000, maxlen=400,
            batch_size=32, embedding_dims=50, epochs=5):
        import numpy as np
        import keras
        from keras.preprocessing import sequence
        from keras.models import Sequential
        from keras.layers import Dense, Embedding, GlobalAveragePooling1D
        # create_ngram_set and add_ngram are helpers defined elsewhere in
        # fasttext.py; a sketch of both follows this example.
        self.tokenizer = keras.preprocessing.text.Tokenizer(
            num_words=max_features, split=" ", char_level=False)
        self.tokenizer.fit_on_texts(X)
        x_train = self.tokenizer.texts_to_sequences(X)
        self.ngram_range = ngram_range
        self.maxlen = maxlen
        self.add_ngrams = lambda x: x
        if ngram_range > 1:
            ngram_set = set()
            for input_list in x_train:
                for i in range(2, ngram_range + 1):
                    set_of_ngram = create_ngram_set(input_list, ngram_value=i)
                    ngram_set.update(set_of_ngram)

            # Dictionary mapping n-gram token to a unique integer.
            # Integer values are greater than max_features in order
            # to avoid collision with existing features.
            start_index = max_features + 1
            self.token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
            indice_token = {self.token_indice[k]: k for k in self.token_indice}

            # max_features is the highest integer that could be found in the dataset.
            max_features = np.max(list(indice_token.keys())) + 1
            self.add_ngrams = lambda x: add_ngram(x, self.token_indice,
                                                  self.ngram_range)
            x_train = self.add_ngrams(x_train)
            print('Average train sequence length: {}'.format(
                np.mean(list(map(len, x_train)), dtype=int)))
        x_train = sequence.pad_sequences(x_train, maxlen=self.maxlen)
        self.model = Sequential()

        # we start off with an efficient embedding layer which maps
        # our vocab indices into embedding_dims dimensions
        self.model.add(Embedding(max_features,
                                 embedding_dims,
                                 input_length=self.maxlen))

        # we add a GlobalAveragePooling1D, which will average the embeddings
        # of all words in the document
        self.model.add(GlobalAveragePooling1D())

        # We project onto a single unit output layer, and squash via sigmoid:
        self.model.add(Dense(1, activation='sigmoid'))

        self.model.compile(loss='binary_crossentropy',
                           optimizer='adam',
                           metrics=['accuracy'])
        self.model.fit(x_train, Y, batch_size=batch_size, epochs=epochs, verbose=2) 
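The fit method above depends on two helpers, create_ngram_set and add_ngram, which are defined elsewhere in fasttext.py and not shown on this page. As a point of reference, the Keras fastText IMDB example implements the same idea roughly as follows; treat this as a sketch rather than the exact sears code:

def create_ngram_set(input_list, ngram_value=2):
    # Return the set of all n-grams (as tuples of token ids) in one sequence.
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))

def add_ngram(sequences, token_indice, ngram_range=2):
    # Append the integer id of every known n-gram to each token sequence.
    new_sequences = []
    for input_list in sequences:
        new_list = list(input_list)
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(input_list) - ngram_value + 1):
                ngram = tuple(input_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences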
Example #3
Source File: fasttext.py    From sears with BSD 2-Clause "Simplified" License
def fit(self, X, Y, max_features=20000, maxlen=400,
            batch_size=32, hidden_dims=250, filters=250, kernel_size=3,
            epochs=5):
        import keras
        from keras.preprocessing import sequence
        from keras.models import Sequential
        from keras.layers import Dense, Dropout, Activation
        from keras.layers import Embedding
        from keras.layers import Conv1D, GlobalMaxPooling1D
        self.tokenizer = keras.preprocessing.text.Tokenizer(
            num_words=max_features, split=" ", char_level=False)
        self.tokenizer.fit_on_texts(X)
        x_train = self.tokenizer.texts_to_sequences(X)
        self.maxlen = maxlen
        embeddings = get_most_common_embeddings(self.tokenizer, self.nlp)
        x_train = sequence.pad_sequences(x_train, maxlen=self.maxlen)
        self.model = Sequential()
        # we start off with a frozen embedding layer which maps our vocab
        # indices into the dimensions of the pretrained embeddings
        self.model.add(
            Embedding(
                embeddings.shape[0],
                embeddings.shape[1],
                input_length=maxlen,
                trainable=False,
                weights=[embeddings]
            )
        )

        self.model.add(Dropout(0.2))

        # we add a Convolution1D, which will learn word-group filters
        # of size kernel_size:
        self.model.add(Conv1D(filters, kernel_size, padding='valid',
                              activation='relu', strides=1))
        # we use max pooling:
        self.model.add(GlobalMaxPooling1D())

        # We add a vanilla hidden layer:
        self.model.add(Dense(hidden_dims))
        self.model.add(Dropout(0.2))
        self.model.add(Activation('relu'))
        # We project onto a single unit output layer, and squash it with a sigmoid:
        self.model.add(Dense(1))
        # model.add(Dense(3))
        self.model.add(Activation('sigmoid'))

        # optimizer = keras.optimizers.Adam(lr=0.001)
        optimizer = keras.optimizers.Adam(lr=0.0001)
        # model.compile(loss='categorical_crossentropy',
        #               optimizer=optimizer,
        #               metrics=['accuracy'])
        self.model.compile(loss='binary_crossentropy',
                           optimizer=optimizer,
                           metrics=['accuracy'])

        self.model.fit(x_train, Y, batch_size=batch_size, epochs=epochs, verbose=2)
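This variant looks up pretrained word vectors through get_most_common_embeddings(self.tokenizer, self.nlp), another helper defined elsewhere in fasttext.py and not shown on this page. Below is a hypothetical sketch of what such a helper could look like when nlp is a spaCy model; the function body, the max_features default, and the zero-vector fallback are assumptions, not the sears implementation:

import numpy as np

def get_most_common_embeddings(tokenizer, nlp, max_features=20000):
    # Hypothetical: build a (vocab_size, embedding_dim) matrix in which row i
    # holds the pretrained vector of the word the tokenizer mapped to index i.
    dim = nlp.vocab.vectors_length
    num_words = min(max_features, len(tokenizer.word_index)) + 1
    embeddings = np.zeros((num_words, dim))  # row 0 is reserved for padding
    for word, idx in tokenizer.word_index.items():
        if idx < num_words:
            # Lexemes with no pretrained vector contribute an all-zero row.
            embeddings[idx] = nlp.vocab[word].vector
    return embeddings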