使用方差选择法,先要计算各个特征的方差,然后根据阈值,选择方差大于阈值的特征。使用feature_selection库的VarianceThreshold类来选择特征的代码如下:
方差选择法,返回值为特征选择后的数据;参数threshold为方差的阈值
# Variance-threshold feature selection on the iris dataset:
# keep only features whose variance exceeds the threshold (3 here).
from sklearn.datasets import load_iris
# BUG FIX: VarianceThreshold was used below but never imported.
from sklearn.feature_selection import VarianceThreshold

iris = load_iris()
print(iris.data[0:5])
# Fit learns per-feature variances; transform drops features with variance <= 3.
selector = VarianceThreshold(threshold=3).fit(iris.data, iris.target)
data = selector.transform(iris.data)
print(data[0:5])
print(selector.variances_)
Pearson描述的是两个变量间线性相关强弱的程度。r的取值在-1与+1之间,若r>0,表明两个变量是正相关,即一个变量的值越大,另一个变量的值也会越大;若r<0,表明两个变量是负相关,即一个变量的值越大另一个变量的值反而会越小。r 的绝对值越大表明相关性越强,要注意的是这里并不存在因果关系。
def calculate_corrcoef(x, y):
    """Return a DataFrame ('col', 'corr_values') of |Pearson r| between each
    column of DataFrame x and target y, sorted by |r| descending.

    NOTE: the original comment said "covariance" (协方差) — np.corrcoef
    actually computes the Pearson correlation coefficient.
    """
    corr_values = []
    float_col = list(x.columns)
    for col in float_col:
        # np.corrcoef returns the 2x2 correlation matrix; [0, 1] is r(x[col], y).
        corr_values.append(abs(np.corrcoef(x[col].values.astype(float), y)[0, 1]))
    corr_df = pd.DataFrame({'col': float_col, 'corr_values': corr_values})
    corr_df = corr_df.sort_values(by='corr_values', ascending=False)
    return corr_df


def calculate_p(x, y):
    """Keep only the columns of x whose |Pearson r| with y is >= 0.1."""
    corr_df = calculate_corrcoef(x, y)
    # Retain features whose correlation with the label is at least 0.1.
    corr2 = corr_df[corr_df.corr_values >= 0.1]
    corr2_col = corr2['col'].values.tolist()
    return x[corr2_col]


if __name__ == "__main__":
    # BUG FIX: this driver line originally ran at import time and crashed with
    # NameError because x_train/y_train are defined elsewhere in the article.
    x_train = calculate_p(x_train, y_train)
方法1:
def drop_repeat_col(data):
    """Drop (near-)constant columns via variance thresholding.

    Uses sklearn's VarianceThreshold: features with variance <= 0.01 are
    removed. Returns the filtered feature matrix as a numpy array (column
    names are lost by fit_transform).
    """
    # BUG FIX: VarianceThreshold was referenced without an import.
    from sklearn.feature_selection import VarianceThreshold

    selector = VarianceThreshold(threshold=0.01)
    data = selector.fit_transform(data)
    print(selector.variances_)
    # BUG FIX: the original computed the result but never returned it.
    return data
方法2:
def remove_waste_col(data):
    """Drop constant columns and date-like columns.

    A column is kept only when its values are not all equal, its max is
    below 1e13 (timestamp-like magnitudes are dropped), and the max does
    not textually contain '2016' or '2017'.
    """
    def _is_informative(col_max, col_min):
        # Exact same predicate as the original chained condition.
        return (col_max != col_min
                and col_max < 1e13
                and '2017' not in str(col_max)
                and '2016' not in str(col_max))

    keep = [col for col in list(data.columns)
            if _is_informative(data[col].max(), data[col].min())]
    return data[keep]
def miss_row(data):
    """Drop rows having more than 500 missing values (mutates data in place)
    and return the DataFrame."""
    per_row = data.isnull().sum(axis=1).reset_index()
    per_row.columns = ['row', 'miss_count']
    # Row labels whose NaN count exceeds the 500 threshold.
    heavy_rows = per_row[per_row.miss_count > 500].row.values
    data.drop(heavy_rows, axis=0, inplace=True)
    return data
def miss_col(data):
    """Drop columns having more than 200 missing values (mutates data in
    place) and return the DataFrame."""
    per_col = data.isnull().sum(axis=0).reset_index()
    per_col.columns = ['col', 'miss_count']
    # Column names whose NaN count exceeds the 200 threshold.
    heavy_cols = per_col[per_col.miss_count > 200].col.values
    data.drop(heavy_cols, axis=1, inplace=True)
    return data
def drop_date(data):
    """Return data without date-like columns, i.e. columns whose maximum
    value textually contains '2016' or '2017'."""
    kept = []
    for column in list(data.columns):
        top = str(data[column].max())
        if '2017' not in top and '2016' not in top:
            kept.append(column)
    return data[kept]
def knn_fill_nan(data, K):
    """Fill missing values using the K nearest complete rows.

    For each row containing NaNs, the Euclidean distance to every complete
    row is computed over the row's non-missing columns; each missing entry
    is filled with the mean of that column over the K closest complete rows.

    NOTE(review): assumes data has a unique index and at least K complete
    rows — confirm with callers.

    Returns a new DataFrame: complete rows first, then the filled rows
    (ignore_index resets the index, original row order is not preserved).
    """
    # Count NaNs per row; rows with any NaN need filling.
    nan_per_row = data.isnull().sum(axis=1).reset_index()
    nan_per_row.columns = ['raw_row', 'nan_count']
    # Row labels that need filling.
    rows_to_fill = nan_per_row[nan_per_row.nan_count > 0].raw_row.values
    # Complete rows act as the "training" set.
    data_no_nan = data.drop(rows_to_fill, axis=0)
    # Rows with NaNs, original data.
    data_nan = data.loc[rows_to_fill]
    for row in rows_to_fill:
        target = data_nan.loc[row]
        # Split this row's columns into missing vs. present.
        null_mask = target.isnull().reset_index()
        null_mask.columns = ['col', 'is_null']
        null_cols = null_mask[null_mask.is_null == 1].col.values
        ok_cols = null_mask[null_mask.is_null == 0].col.values
        # Values already present in this row.
        present_part = target[ok_cols]
        # Broadcast matrix - vector, then Euclidean distance per complete row.
        diff = data_no_nan[ok_cols] - target[ok_cols]
        dist = (diff ** 2).sum(axis=1).apply(np.sqrt).reset_index()
        dist.columns = ['raw_row', 'diff_val']
        nearest = dist.sort_values(by='diff_val', ascending=True).reset_index()
        # Labels of the K nearest complete rows.
        top_k_rows = nearest.loc[0:K - 1].raw_row.values
        # Fill each missing column with the mean over the K neighbours.
        fill_values = data.loc[top_k_rows][null_cols].sum(axis=0) / K
        # Reassemble the full row and append it to the complete set.
        # BUG FIX: DataFrame.append was removed in pandas 2.0 — use pd.concat.
        full_row = pd.concat([present_part, fill_values])
        data_no_nan = pd.concat([data_no_nan, full_row.to_frame().T],
                                ignore_index=True)
    print('填补完成')
    return data_no_nan
def drop_non_number(data):
    """Keep only the float64 columns of data.

    Side effect: the object-dtype columns are written to 'non_number.csv'.
    Note that int columns are neither kept nor saved — they are simply
    dropped, matching the original intent (去掉object,int的列).
    """
    dtype_table = data.dtypes.reset_index()
    dtype_table.columns = ['col', 'dtype']
    object_cols = dtype_table[dtype_table.dtype == 'object'].col.values
    data[object_cols].to_csv('non_number.csv', index=False)
    float_cols = dtype_table[dtype_table.dtype == 'float64'].col.values
    return data[float_cols]
(1)pandas计算偏度、峰度
import pandas as pd

# Sample skewness and excess kurtosis of a small sample, via pandas.
samples = pd.Series([53, 61, 49, 66, 78, 47])
print(samples.skew())
print(samples.kurt())
上面代码的输出结果如下:
0.7826325504212567 -0.2631655441038463
(2)dataframe格式的数据计算偏度
# Per-column skewness of a DataFrame (axis=0 means "down each column").
# BUG FIX: the original line had the interpreter's printed output pasted
# directly into the code, making it a syntax error; the output is kept
# below as a comment instead.
data_frame2 = pd.DataFrame({'A': [1, 2, 3, 4], 'b': [1, 0, 3, 1]})
print(data_frame2.skew(axis=0))
# Expected output:
# A    0.000000
# b    1.129338
# dtype: float64
skew(axis=None, skipna=None, level=None, numeric_only=None, **kwargs)
二、Wrapper
三、Embedded
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV
import xgboost as xgb


def get_top_k_feature(features, model, top_n_features):
    """Return the names of the top_n_features most important features.

    model is a fitted GridSearchCV whose best_estimator_ exposes
    feature_importances_ (any sklearn tree ensemble).
    """
    # BUG FIX: the original passed ascending='False'; a non-empty string is
    # truthy, so importances were sorted ASCENDING and head() returned the
    # LEAST important features. Use the boolean False.
    ranked = pd.DataFrame({
        'feature': features,
        'importance': model.best_estimator_.feature_importances_,
    }).sort_values('importance', ascending=False)
    return ranked.head(top_n_features)['feature']


def ensemble_model_feature(X, Y, top_n_features):
    """Select features with three tree ensembles (RandomForest, AdaBoost,
    ExtraTrees) and return the de-duplicated union of their top features."""
    features = list(X)
    # One shared grid: 900 trees, several random seeds.
    param_grid = {'n_estimators': [900], 'random_state': [2, 4, 6, 8]}
    # Random forest
    rf = ensemble.RandomForestRegressor()
    rf_grid = GridSearchCV(rf, param_grid, cv=10, verbose=1, n_jobs=25)
    rf_grid.fit(X, Y)
    top_n_features_rf = get_top_k_feature(features=features, model=rf_grid,
                                          top_n_features=top_n_features)
    print('RF 选择完毕')
    # AdaBoost
    abr = ensemble.AdaBoostRegressor()
    abr_grid = GridSearchCV(abr, param_grid, cv=10, n_jobs=25)
    abr_grid.fit(X, Y)
    top_n_features_bgr = get_top_k_feature(features=features, model=abr_grid,
                                           top_n_features=top_n_features)
    print('Adaboost选择完毕')
    # ExtraTrees
    etr = ensemble.ExtraTreesRegressor()
    etr_grid = GridSearchCV(etr, param_grid, cv=10, n_jobs=25)
    etr_grid.fit(X, Y)
    top_n_features_etr = get_top_k_feature(features=features, model=etr_grid,
                                           top_n_features=top_n_features)
    print('ExtraTree选择完毕')
    # Union of the three selections, duplicates removed.
    features_top_n = pd.concat([top_n_features_rf, top_n_features_bgr,
                                top_n_features_etr],
                               ignore_index=True).drop_duplicates()
    print(features_top_n)
    print(len(features_top_n))
    return features_top_n
参考文献:
【1】特征选择之方差选择法VarianceThreshold
【2】Python中如何计算Pearson相关性和显著性?
【3】python 皮尔森相关系数