Python Examples of sklearn.preprocessing.scale

Source File: FFM_Multi_PyTorch.py From Awesome-RecSystem-Models with MIT License

7 votes

def train_FFM_model_demo():

    # Step1: 导入数据
    x_train, y_train, x_test, y_test, feature2field = load_dataset()
    x_train = preprocessing.scale(x_train, with_mean=True, with_std=True)
    x_test = preprocessing.scale(x_test, with_mean=True, with_std=True)
    class_num = len(set([y for y in y_train] + [y for y in y_test]))

    # FFM模型
    ffm = FFM_layer(field_map_dict=feature2field, fea_num=x_train.shape[1], reg_l1=0.01, reg_l2=0.01,
                    class_num=class_num, latent_factor_dim=10).to(DEVICE)

    # 定义损失函数还有优化器
    optm = torch.optim.Adam(ffm.parameters())

    train_loader = get_batch_loader(x_train, y_train, BATCH_SIZE, shuffle=True)
    test_loader = get_batch_loader(x_test, y_test, BATCH_SIZE, shuffle=False)

    for epoch in range(1, EPOCHS + 1):
        train(ffm, DEVICE, train_loader, optm, epoch)
        test(ffm, DEVICE, test_loader)

Source File: pre_processing.py From BERMUDA with MIT License

6 votes

def pre_processing(dataset_file_list, pre_process_paras):
    """ pre-processing of multiple datasets
    Args:
        dataset_file_list: list of filenames of datasets
        pre_process_paras: dict, parameters for pre-processing
    Returns:
        dataset_list: list of datasets
    """
    # parameters
    take_log = pre_process_paras['take_log']
    standardization = pre_process_paras['standardization']
    scaling = pre_process_paras['scaling']

    dataset_list = []
    for data_file in dataset_file_list:
        dataset = read_csv(data_file, take_log)
        if standardization:
            scale(dataset['gene_exp'], axis=1, with_mean=True, with_std=True, copy=False)
        if scaling:  # scale to [0,1]
            minmax_scale(dataset['gene_exp'], feature_range=(0, 1), axis=1, copy=False)
        dataset_list.append(dataset)
    dataset_list = intersect_dataset(dataset_list)  # retain intersection of gene symbols

    return dataset_list

Source File: Get_flow_ev.py From MultipleFactorRiskModel with MIT License

6 votes

def get_ind_return(data):
    '''
    将从xlsx中读取出来按列拼接好的数据进行重组，计算出每个行业每个月的收益率
    :param [DataFrame] data: 从xlsx文件中读取的月份-交易数据
    :return: [DataFrame] ind_ret: 月份*行业 每个行业每个月的收益率
    '''
    # 读入stk_ind_pair.xlsx，用作股票和其所属行业的对照表
    stk_ind = pd.read_excel('E:\\QuantProject2\\temp_data\\stk_ind_pair.xlsx')
    # 把stk_ind里面股票代码数字部分后面的字母去掉
    stk_ind.Stkcd = stk_ind.Stkcd.apply(lambda x: x[:6])
    # 对stk_ind和data进行merge操作，将行业信息插入data
    data = pd.merge(data, stk_ind, on='Stkcd')
    # 按照月份和行业分组
    groups = data.groupby(['Trdmnt', 'ind'])
    # 分组计算每个月每个行业的总市值
    total_Ms = groups['Msmvttl'].sum()
    # 分组计算每个月每个行业按照市值加权的收益率
    total_Mr=groups['total_Mr'].sum()
    # 相除得到每个月每个行业的平均收益率
    ind_ret=total_Mr/total_Ms
    # 将ind_ret的内层level转换为列
    ind_ret=ind_ret.unstack()
    #将ind_ret标准化
    ind_ret=pd.DataFrame(scale(ind_ret),columns=ind_ret.columns)
    return ind_ret

Source File: atlas3.py From ssbio with MIT License

6 votes

def run_pca(self, whiten=True):
        # Normalize
        for_pca_df = self.features_df.T
        for_pca_df_scaled = pd.DataFrame(preprocessing.scale(for_pca_df), columns=for_pca_df.columns)

        # Run PCA
        self.num_components = min(len(for_pca_df.T.columns), len(for_pca_df.T.index))
        pca = PCA(n_components=self.num_components, whiten=whiten)
        pca_fit = pca.fit_transform(for_pca_df_scaled)
        self.pc_names_list = ['PC{} ({:.0%})'.format(x + 1, pca.explained_variance_ratio_[x]) for x in
                                  range(self.num_components)]
        self.pc_names_dict = {k.split(' ')[0]: k for k in self.pc_names_list}
        principal_df = pd.DataFrame(data=pca_fit, columns=self.pc_names_list, index=for_pca_df.index)
        principal_df.index.name = 'strain'

        self.principal_df = principal_df
        self.pca = pca
        # self.principal_observations_df = self.principal_df.join(self.observations_df, how='inner')
        #
        # # Make iterable list of markers
        # mks = itertools.cycle(["<", "+", "o", 'D', 'x', '^', '*', '8', 's', 'p', 'v', 'X', '_', 'h'])
        # self.markers = [next(mks) for i in range(len(self.principal_observations_df[self.observation_colname].unique()))]

Source File: test_logistic.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_elastic_net_versus_sgd(C, l1_ratio):
    # Compare elasticnet penalty in LogisticRegression() and SGD(loss='log')
    n_samples = 500
    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
                               n_informative=5, n_redundant=0, n_repeated=0,
                               random_state=1)
    X = scale(X)

    sgd = SGDClassifier(
        penalty='elasticnet', random_state=1, fit_intercept=False, tol=-np.inf,
        max_iter=2000, l1_ratio=l1_ratio, alpha=1. / C / n_samples, loss='log')
    log = LogisticRegression(
        penalty='elasticnet', random_state=1, fit_intercept=False, tol=1e-5,
        max_iter=1000, l1_ratio=l1_ratio, C=C, solver='saga')

    sgd.fit(X, y)
    log.fit(X, y)
    assert_array_almost_equal(sgd.coef_, log.coef_, decimal=1)

Source File: analysis.py From smallrnaseq with GNU General Public License v3.0

6 votes

def do_pca(X, c=3):
    """Do PCA"""

    from sklearn import preprocessing
    from sklearn.decomposition.pca import PCA, RandomizedPCA
    #do PCA
    #S = standardize_data(X)
    S = pd.DataFrame(preprocessing.scale(X),columns = X.columns)
    pca = PCA(n_components=c)
    pca.fit(S)
    print (pca.explained_variance_ratio_)
    #print pca.components_
    w = pd.DataFrame(pca.components_,columns=S.columns)#,index=['PC1','PC2'])
    #print w.T.max(1).sort_values()
    pX = pca.fit_transform(S)
    pX = pd.DataFrame(pX,index=X.index)
    return pX

Source File: utils.py From SCALE with MIT License

6 votes

def estimate_k(data):
    """
    Estimate number of groups k:
        based on random matrix theory (RTM), borrowed from SC3
        input data is (p,n) matrix, p is feature, n is sample
    """
    p, n = data.shape
    if type(data) is not np.ndarray:
        data = data.toarray()
    x = scale(data)
    muTW = (np.sqrt(n-1) + np.sqrt(p)) ** 2
    sigmaTW = (np.sqrt(n-1) + np.sqrt(p)) * (1/np.sqrt(n-1) + 1/np.sqrt(p)) ** (1/3)
    sigmaHatNaive = x.T.dot(x)

    bd = np.sqrt(p) * sigmaTW + muTW
    evals = np.linalg.eigvalsh(sigmaHatNaive)

    k = 0
    for i in range(len(evals)):
        if evals[i] > bd:
            k += 1
    return k

Source File: industry_return.py From MultipleFactorRiskModel with MIT License

6 votes

def get_ind_return(data):
    '''
    将从xlsx中读取出来按列拼接好的数据进行重组，计算出每个行业每个月的收益率
    :param [DataFrame] data: 从xlsx文件中读取的月份-交易数据
    :return: [DataFrame] ind_ret: 月份*行业 每个行业每个月的收益率
    '''
    # 读入stk_ind_pair.xlsx，用作股票和其所属行业的对照表
    stk_ind = pd.read_excel('E:\\QuantProject2\\temp_data\\stk_ind_pair.xlsx')
    # 把stk_ind里面股票代码数字部分后面的字母去掉
    stk_ind.Stkcd = stk_ind.Stkcd.apply(lambda x: x[:6])
    # 对stk_ind和data进行merge操作，将行业信息插入data
    data = pd.merge(data, stk_ind, on='Stkcd')
    # 按照月份和行业分组
    groups = data.groupby(['Trdmnt', 'ind'])
    # 分组计算每个月每个行业的总市值
    total_Ms = groups['Msmvttl'].sum()
    # 分组计算每个月每个行业按照市值加权的收益率
    total_Mr=groups['total_Mr'].sum()
    # 相除得到每个月每个行业的平均收益率
    ind_ret=total_Mr/total_Ms
    # 将ind_ret的内层level转换为列
    ind_ret=ind_ret.unstack()
    #将ind_ret标准化
    ind_ret=pd.DataFrame(scale(ind_ret),columns=ind_ret.columns)
    return ind_ret

Source File: cluster.py From cmdbac with Apache License 2.0

6 votes

def kmeans_elbow(data):
    bin_ = Bin(0, 0)
    # processed_data = scale(data)
    data = np.array(data)
    bin_.fit(data)
    processed_data = bin_.transform(data)
    # processed_data = scale(data)

    inertias = []
    for k in K_RANGE:
        kmeans = KMeans(init='k-means++', n_clusters=k)
        kmeans.fit(processed_data)
        inertias.append(kmeans.inertia_)

    fig = plt.figure()
    plt.scatter(K_RANGE, inertias)
    plt.plot(K_RANGE, inertias)
    fig.savefig('kmeans-elbow.png')

Source File: FM_Multi_PyTorch.py From Awesome-RecSystem-Models with MIT License

6 votes

def train_FM_model_demo():

    # Step1: 导入数据
    x_train, y_train, x_test, y_test = load_dataset()
    x_train = preprocessing.scale(x_train, with_mean=True, with_std=True)
    x_test = preprocessing.scale(x_test, with_mean=True, with_std=True)
    class_num = len(set([y for y in y_train] + [y for y in y_test]))

    # FM模型
    fm = FM_layer(class_num=class_num, feature_num=x_train.shape[1], latent_factor_dim=40).to(DEVICE)

    # 定义损失函数还有优化器
    optm = torch.optim.Adam(fm.parameters())

    train_loader = get_batch_loader(x_train, y_train, BATCH_SIZE, shuffle=True)
    test_loader = get_batch_loader(x_test, y_test, BATCH_SIZE, shuffle=False)

    for epoch in range(1, EPOCHS + 1):
        train(fm, DEVICE, train_loader, optm, epoch)
        test(fm, DEVICE, test_loader)

Source File: umbilical.py From geosketch with MIT License

6 votes

def violin_jitter(X, genes, gene, labels, focus, background=None,
                  xlabels=None):
    gidx = list(genes).index(gene)

    focus_idx = focus == labels
    if background is None:
        background_idx = focus != labels
    else:
        background_idx = background == labels

    if xlabels is None:
        xlabels = [ 'Background', 'Focus' ]

    x_gene = X[:, gidx].toarray().flatten()
    x_focus = x_gene[focus_idx]
    x_background = x_gene[background_idx]
    
    plt.figure()
    sns.violinplot(data=[ x_focus, x_background ], scale='width', cut=0)
    sns.stripplot(data=[ x_focus, x_background ], jitter=True, color='black', size=1)
    plt.xticks([0, 1], xlabels)
    plt.savefig('{}_violin_{}.png'.format(NAMESPACE, gene))

Source File: brain_data.py From nltools with MIT License

6 votes

def scale(self, scale_val=100.):
        """ Scale all values such that they are on the range [0, scale_val],
            via grand-mean scaling. This is NOT global-scaling/intensity
            normalization. This is useful for ensuring that data is on a
            common scale (e.g. good for multiple runs, participants, etc)
            and if the default value of 100 is used, can be interpreted as
            something akin to (but not exactly) "percent signal change."
            This is consistent with default behavior in AFNI and SPM.
            Change this value to 10000 to make consistent with FSL.

        Args:
            scale_val: (int/float) what value to send the grand-mean to;
                        default 100

        """

        out = deepcopy(self)
        out.data = out.data / out.data.mean() * scale_val

        return out

Source File: brain_data.py From nltools with MIT License

6 votes

def standardize(self, axis=0, method='center'):
        ''' Standardize Brain_Data() instance.

        Args:
            axis: 0 for observations 1 for voxels
            method: ['center','zscore']

        Returns:
            Brain_Data Instance

        '''

        if axis == 1 and len(self.shape()) == 1:
            raise IndexError("Brain_Data is only 3d but standardization was requested over observations")
        out = self.copy()
        if method == 'zscore':
            with_std = True
        elif method == 'center':
            with_std = False
        else:
            raise ValueError('method must be ["center","zscore"')
        out.data = scale(out.data, axis=axis, with_std=with_std)
        return out

Source File: __init__.py From marconibot with GNU General Public License v3.0

6 votes

def train(self, df, shuffle=True, preprocess=False, *args, **kwargs):
        """
        Takes a dataframe of features + a 'label' column and trains the lobe
        """
        if self._trained:
            logger.warning('Overwriting an already trained brain!')
            self._trained = False

        # shuffle data for good luck
        if shuffle:
            df = shuffleDataFrame(df)
        # scale train data and fit lobe
        x = df.drop('label', axis=1).values
        y = df['label'].values
        del df
        if preprocess:
            x = preprocessing.scale(x)
        logger.info('Training with %d samples', len(x))
        self.lobe.fit(x, y)
        self._trained = True

Source File: test_nn.py From mljar-supervised with MIT License

6 votes

def setUpClass(cls):
        cls.X, cls.y = datasets.make_regression(
            n_samples=100, n_features=5, n_informative=4, shuffle=False, random_state=0
        )

        cls.params = {
            "dense_layers": 2,
            "dense_1_size": 8,
            "dense_2_size": 4,
            "dropout": 0,
            "learning_rate": 0.01,
            "momentum": 0.9,
            "decay": 0.001,
            "ml_task": "regression"
        }

        cls.y = preprocessing.scale(cls.y)

Source File: pca_autoencoder.py From safekit with MIT License

6 votes

def train(train_data, outfile):
        """
        :param train_data: A Batcher object that delivers batches of train data.
        :param outfile: (str) Where to print results.
        """
        outfile.write('day user red loss\n')
        mat = train_data.next_batch()
        while mat is not None:
            datadict = {'features': mat[:, 3:], 'red': mat[:,2], 'user': mat[:,1], 'day': mat[:,0]}
            batch = scale(datadict['features'])
            pca = PCA(n_components=1)
            pca.fit(batch)
            data_reduced = np.dot(batch, pca.components_.T) # pca transform
            data_original = np.dot(data_reduced, pca.components_) # inverse_transform
            pointloss = np.mean(np.square(batch - data_original), axis=1)
            loss = np.mean(pointloss)
            for d, u, t, l, in zip(datadict['day'].tolist(), datadict['user'].tolist(),
                                   datadict['red'].tolist(), pointloss.flatten().tolist()):
                outfile.write('%s %s %s %s\n' % (d, u, t, l))
            print('loss: %.4f' % loss)
            mat = train_data.next_batch()

Source File: preprocessing_utils.py From mljar-supervised with MIT License

6 votes

def is_log_scale_needed(x_org):
        x = np.array(x_org[~pd.isnull(x_org)])
        # first scale on raw data
        x = preprocessing.scale(x)
        # second scale on log data
        x_log = preprocessing.scale(np.log(x - np.min(x) + 1))

        # the old approach, let's check how new approach will work
        # original_skew = np.abs(stats.skew(x))
        # log_skew = np.abs(stats.skew(x_log))
        # return log_skew < original_skew
        ########################################################################
        # p is probability of being normal distributions
        k2, p1 = stats.normaltest(x)
        k2, p2 = stats.normaltest(x_log)

        return p2 > p1

Source File: demo.py From MRI-tumor-segmentation-Brats with MIT License

5 votes

def vox_preprocess(vox):
    vox_shape = vox.shape
    vox = np.reshape(vox, (-1, vox_shape[-1]))
    vox = scale(vox, axis=0)
    return np.reshape(vox, vox_shape)

Source File: temp.py From neural-finance with Apache License 2.0

5 votes

def sk_scale(X):
    return scale(X, axis=0, with_mean=True, with_std=True, copy=True )

Source File: neural_data.py From neural-finance with Apache License 2.0

5 votes

def sk_min_max(X):
    min_max_scaler = MinMaxScaler()
    # X = scale(X, axis=0, with_mean=True, with_std=True, copy=True)
    return min_max_scaler.fit_transform(X)

Source File: train.py From MRI-tumor-segmentation-Brats with MIT License

5 votes

def vox_preprocess(vox):
    vox_shape = vox.shape
    vox = np.reshape(vox, (-1, vox_shape[-1]))
    vox = scale(vox, axis=0)
    return np.reshape(vox, vox_shape)

Source File: test.py From MRI-tumor-segmentation-Brats with MIT License

5 votes

def DenseNetTransit(x, rate=1, name=None):
    if rate != 1:
        out_features = x.get_shape().as_list()[-1] * rate
        x = BatchNormalization(center=True, scale=True, name=name + '_bn')(x)
        x = Activation('relu', name=name + '_relu')(x)
        x = Conv3D(filters=out_features, kernel_size=1, strides=1, padding='same', kernel_initializer='he_normal',
                   use_bias=False, name=name + '_conv')(x)
    x = AveragePooling3D(pool_size=2, strides=2, padding='same')(x)
    return x

Source File: neural_data.py From neural-finance with Apache License 2.0

5 votes

def sk_scale(X):
    return scale(X, axis=0, with_mean=True, with_std=True, copy=True )

Source File: test.py From MRI-tumor-segmentation-Brats with MIT License

5 votes

def DenseNetUnit3D(x, growth_rate, ksize, n, bn_decay=0.99):
    for i in range(n):
        concat = x
        x = BatchNormalization(center=True, scale=True, momentum=bn_decay)(x)
        x = Activation('relu')(x)
        x = Conv3D(filters=growth_rate, kernel_size=ksize, padding='same', kernel_initializer='he_uniform',
                   use_bias=False)(x)
        x = concatenate([concat, x])
    return x

Source File: test.py From MRI-tumor-segmentation-Brats with MIT License

5 votes

def vox_preprocess(vox):
    vox_shape = vox.shape
    vox = np.reshape(vox, (-1, vox_shape[-1]))
    vox = scale(vox, axis=0)
    return np.reshape(vox, vox_shape)

Source File: tensorflow_input_data.py From DeepLearning_Wavelet-LSTM with MIT License

5 votes

def inputData(firstFileNo, lastFileNo):
    # print('目前系统的编码为：',sys.getdefaultencoding()) 

    '''  训练数据生成  '''
    path = 'C:/锚索测量数据库/LSTMs训练数据/' 
    path_TagJson = 'C:/锚索测量数据库/LSTMs训练数据/tag.json'

    TagDict = myJsonLoad(path_TagJson)

    all_cwtmatr = []
    all_labels_oneHot = []

    for number in range(firstFileNo, lastFileNo+1):
        # number = 5 # 1~5
        cwtmatr_cwt = opeanFile(path + str(number)+'.seg')
        for i in range(len(cwtmatr_cwt)):    
            cwtmatr = np.array( cwtmatr_cwt[i] )
            # 逐行 Z-Score 标准化
            cwtmatr = preprocessing.scale(cwtmatr, axis =1)
            all_cwtmatr.append(cwtmatr)
            # MyPlot(cwtmatr)
        
            labels = TagDict[number-1]['items']
            labels, labels_oneHot = MyLabels( labels )
            all_labels_oneHot.append( np.array(labels_oneHot) )
            # plt.plot(labels)

            # plt.show() 

    all_cwtmatr = np.array(all_cwtmatr )
    all_labels_oneHot = np.array(all_labels_oneHot )

    # print(all_cwtmatr.shape)
    # print(all_labels_oneHot.shape)
    # print(all_labels_oneHot[0].shape)
    
    return all_cwtmatr, all_labels_oneHot

Source File: prepare_data.py From acerta-abide with GNU General Public License v2.0

5 votes

def load_patient(subj, tmpl):
    df = pd.read_csv(format_config(tmpl, {
        "subject": subj,
    }), sep="\t", header=0)
    df = df.apply(lambda x: pd.to_numeric(x, errors='coerce'))

    ROIs = ["#" + str(y) for y in sorted([int(x[1:]) for x in df.keys().tolist()])]

    functional = np.nan_to_num(df[ROIs].to_numpy().T).tolist()
    functional = preprocessing.scale(functional, axis=1)
    functional = compute_connectivity(functional)
    functional = functional.astype(np.float32)

    return subj, functional.tolist()

Source File: LDA.py From pynoddy with GNU General Public License v2.0

5 votes

def scatter_plot( self, **kwds ):
        '''
        Generates a scatter plot using the first 2 princpal components as axes.
        
        **Keywords**:
         - *vectors* = True if original axes should be plotted as vectors in this space. Default is False.
         - *path* = a path to save this figure to
         - *dpi* = the dpi of the saved figure
         - *width* = the width (in inches) of the saved figure
         - *height* = the height (in inches) of the saved figure
        '''
        #get 2d subspace
        ss = self.get_subspace(n=2)
        
        #calculate colours
        import matplotlib.cm as cm
        scale = 255 / (max(self.groups) + 1)
        c = cm.Set1(self.groups*scale,alpha=1)
        
        #plot scatterplot
        fig,ax = plt.subplots()
        ss.plot('PC1','PC2',kind='scatter',c=c,ax=ax)
        
        #plot vectors
        if (kwds.has_key('vectors')):
            if kwds['vectors'] == True:
                
                #calculate initial vectors (ie. identity matrix)
                axes=np.identity(len(self.data.columns))
        
                #project
                axes=self.project(axes,2)
                
                #plot
                for i,a in enumerate(axes):
                    x = [0,a[0]]
                    y = [0,a[1]]
                    ax.plot(x,y,label=self.data.columns[i])
                
                ax.legend()
                
    #private functions

Source File: utils.py From Neural-Network-Projects-with-Python with MIT License

5 votes

def preprocess(df):
    print('----------------------------------------------')
    print("Before preprocessing")
    print("Number of rows with 0 values for each variable")
    for col in df.columns:
        missing_rows = df.loc[df[col]==0].shape[0]
        print(col + ": " + str(missing_rows))
    print('----------------------------------------------')

    # Replace 0 values with the mean of the existing values
    df['Glucose'] = df['Glucose'].replace(0, np.nan)
    df['BloodPressure'] = df['BloodPressure'].replace(0, np.nan)
    df['SkinThickness'] = df['SkinThickness'].replace(0, np.nan)
    df['Insulin'] = df['Insulin'].replace(0, np.nan)
    df['BMI'] = df['BMI'].replace(0, np.nan)
    df['Glucose'] = df['Glucose'].fillna(df['Glucose'].mean())
    df['BloodPressure'] = df['BloodPressure'].fillna(df['BloodPressure'].mean())
    df['SkinThickness'] = df['SkinThickness'].fillna(df['SkinThickness'].mean())
    df['Insulin'] = df['Insulin'].fillna(df['Insulin'].mean())
    df['BMI'] = df['BMI'].fillna(df['BMI'].mean())

    print('----------------------------------------------')
    print("After preprocessing")
    print("Number of rows with 0 values for each variable")
    for col in df.columns:
        missing_rows = df.loc[df[col]==0].shape[0]
        print(col + ": " + str(missing_rows))
    print('----------------------------------------------')

    # Standardization
    df_scaled = preprocessing.scale(df)
    df_scaled = pd.DataFrame(df_scaled, columns=df.columns)
    df_scaled['Outcome'] = df['Outcome']
    df = df_scaled
    

    return df

Source File: dataset.py From ecg_pytorch with Apache License 2.0

5 votes

def scaling(X, sigma=0.1):
    scalingFactor = np.random.normal(loc=1.0, scale=sigma, size=(1, X.shape[1]))
    myNoise = np.matmul(np.ones((X.shape[0], 1)), scalingFactor)
    return X * myNoise

Python sklearn.preprocessing.scale() Examples