I. Filter
1. VarianceThreshold (variance-based selection)
With variance-based selection, you first compute the variance of each feature and then keep only the features whose variance exceeds a threshold. The code below uses the VarianceThreshold class from sklearn's feature_selection module; it returns the data after feature selection, and the threshold parameter is the variance cutoff:
from sklearn.datasets import load_iris
from sklearn.feature_selection import VarianceThreshold

iris = load_iris()
print(iris.data[0:5])
# keep only the features whose variance exceeds threshold=3
selector = VarianceThreshold(threshold=3).fit(iris.data, iris.target)
data = selector.transform(iris.data)
print(data[0:5])
print(selector.variances_)
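Note that transform returns a bare NumPy array, so column names are lost. A small sketch (assuming the features sit in a pandas DataFrame) that uses the selector's get_support method to recover which columns survived:

import pandas as pd

df = pd.DataFrame(iris.data, columns=iris.feature_names)
sel = VarianceThreshold(threshold=3).fit(df)
kept = df.columns[sel.get_support()]   # names of the surviving columns
df_selected = df[kept]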
2. Pearson correlation
The Pearson coefficient r describes how strongly two variables are linearly related. r ranges from -1 to +1: r > 0 means the variables are positively correlated (as one grows, the other tends to grow), while r < 0 means they are negatively correlated (as one grows, the other tends to shrink). The larger |r| is, the stronger the correlation; note that correlation says nothing about causation.
A rough classification of correlation strength:
- 0.8-1.0 very strong correlation
- 0.6-0.8 strong correlation
- 0.4-0.6 moderate correlation
- 0.2-0.4 weak correlation
- 0.0-0.2 very weak or no correlation
The Pearson coefficient can be computed with NumPy's corrcoef, as follows:
import numpy as np
import pandas as pd

# compute |Pearson correlation| between every feature column and the label
def calculate_corrcoef(x, y):
    corr_values = []
    float_col = list(x.columns)
    for col in float_col:
        corr_values.append(abs(np.corrcoef(x[col].values.astype(float), y)[0, 1]))
    corr_df = pd.DataFrame({'col': float_col, 'corr_values': corr_values})
    corr_df = corr_df.sort_values(by='corr_values', ascending=False)
    return corr_df

# keep only the features whose |correlation| with the label is at least 0.1
def calculate_p(x, y):
    corr_df = calculate_corrcoef(x, y)
    corr2 = corr_df[corr_df.corr_values >= 0.1]
    corr2_col = corr2['col'].values.tolist()
    return x[corr2_col]

x_train = calculate_p(x_train, y_train)
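np.corrcoef gives r only. If you also want a significance level, scipy.stats.pearsonr returns both r and a two-sided p-value; a minimal sketch (assuming SciPy is installed, and picking the first feature column just as an example):

from scipy import stats

r, p_value = stats.pearsonr(x_train.iloc[:, 0], y_train)  # one feature vs. the label
print(r, p_value)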
3. Dropping low-variance columns
Method 1:
from sklearn.feature_selection import VarianceThreshold

# Drop near-constant columns using the variance-based selection from
# section 1: features whose variance is at most threshold are removed
def drop_repeat_col(data):
    selector = VarianceThreshold(threshold=0.01)
    data = selector.fit_transform(data)
    print(selector.variances_)
    return data
Method 2:
# Drop constant columns as well as date-like columns
def remove_waste_col(data):
    columns = list(data.columns)
    not_data_col = []
    for col in columns:
        max_num = data[col].max()
        # keep the column only if it is not constant, not a huge
        # timestamp-like number, and its max does not contain a year string
        if (max_num != data[col].min() and max_num < 1e13
                and str(max_num).find('2017') == -1
                and str(max_num).find('2016') == -1):
            not_data_col.append(col)
    return data[not_data_col]
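An equivalent pandas-native sketch for method 1 that keeps the column names (assuming data is an all-numeric DataFrame):

data_selected = data.loc[:, data.var() > 0.01]   # keep columns with variance above 0.01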
4. Dropping rows and columns with many missing values
# Drop rows with many missing values (more than 500 NaNs per row here)
def miss_row(data):
    miss_row = data.isnull().sum(axis=1).reset_index()
    miss_row.columns = ['row', 'miss_count']
    miss_row_value = miss_row[miss_row.miss_count > 500].row.values
    data.drop(miss_row_value, axis=0, inplace=True)
    return data

# Drop columns with many missing values (more than 200 NaNs per column here)
def miss_col(data):
    miss_col = data.isnull().sum(axis=0).reset_index()
    miss_col.columns = ['col', 'miss_count']
    miss_col_value = miss_col[miss_col.miss_count > 200].col.values
    data.drop(miss_col_value, axis=1, inplace=True)
    return data
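The hard-coded cutoffs (500, 200) are dataset-specific. pandas' dropna can express the same idea as a fraction; a sketch that drops columns with more than 20% missing values:

min_non_na = int(0.8 * len(data))            # require at least 80% non-null values per column
data = data.dropna(axis=1, thresh=min_non_na)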
5. Dropping date columns
# Keep only the columns whose max value does not contain '2016' or '2017',
# i.e. drop columns that look like dates
def drop_date(data):
    columns = list(data.columns)
    not_date_columns = []
    for column in columns:
        tmp_num = data[column].max()
        if str(tmp_num).find('2017') == -1 and str(tmp_num).find('2016') == -1:
            not_date_columns.append(column)
    return data[not_date_columns]
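Hard-coding '2016'/'2017' breaks on other years. A slightly more general sketch of the same heuristic, under the assumption that dates are encoded as numbers whose digits start with a 20xx year:

import re

DATE_LIKE = re.compile(r'^20\d{2}')   # hypothetical pattern: values starting with a 20xx year

def drop_date_like(data):
    keep = [c for c in data.columns if not DATE_LIKE.match(str(data[c].max()))]
    return data[keep]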
6. Filling missing values with KNN
# Fill missing values with the mean of the K nearest neighbors:
# complete rows serve as candidates, and distance is measured only
# on the columns that are non-null in the row being filled
def knn_fill_nan(data, K):
    # count the NaNs in each row
    data_row = data.isnull().sum(axis=1).reset_index()
    data_row.columns = ['raw_row', 'nan_count']
    # rows containing NaNs (the rows to fill)
    data_row_nan = data_row[data_row.nan_count > 0].raw_row.values
    # rows without NaNs
    data_no_nan = data.drop(data_row_nan, axis=0)
    # rows with NaNs
    data_nan = data.loc[data_row_nan]
    for row in data_row_nan:
        data_row_need_fill = data_nan.loc[row]
        # split this row's columns into null and non-null;
        # the non-null columns are used for the KNN distance
        data_col_index = data_row_need_fill.isnull().reset_index()
        data_col_index.columns = ['col', 'is_null']
        is_null_col = data_col_index[data_col_index.is_null == 1].col.values
        data_col_no_nan_index = data_col_index[data_col_index.is_null == 0].col.values
        # the non-null part of the row being filled
        data_row_fill = data_row_need_fill[data_col_no_nan_index]
        # broadcasting: matrix minus vector
        data_diff = data_no_nan[data_col_no_nan_index] - data_row_need_fill[data_col_no_nan_index]
        # Euclidean distance to every complete row
        data_diff = np.sqrt((data_diff ** 2).sum(axis=1))
        data_diff = data_diff.reset_index()
        data_diff.columns = ['raw_row', 'diff_val']
        data_diff_sorted = data_diff.sort_values(by='diff_val', ascending=True).reset_index()
        # take the K nearest rows and fill each null column with
        # the mean of their values in that column
        top_k_rows = data_diff_sorted.loc[0:K - 1].raw_row.values
        top_k_mean = data.loc[top_k_rows][is_null_col].sum(axis=0) / K
        # append the filled columns to the non-null part of the row
        data_row_fill = pd.concat([data_row_fill, top_k_mean]).to_frame().T
        data_no_nan = pd.concat([data_no_nan, data_row_fill], ignore_index=True)
    print('filling complete')
    return data_no_nan
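scikit-learn ships a ready-made equivalent; a minimal sketch (KNNImputer is available from sklearn 0.22 onward, and data is assumed to be an all-numeric DataFrame):

import pandas as pd
from sklearn.impute import KNNImputer

# distances are computed over the observed entries; each missing cell
# gets the average of the K nearest rows, like the hand-rolled version
imputer = KNNImputer(n_neighbors=5)
data_filled = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)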
7. Dropping object and int columns
# Keep only the float64 columns; dump the object columns to a CSV for inspection
def drop_non_number(data):
    data_types = data.dtypes.reset_index()
    data_types.columns = ['col', 'dtype']
    data_object = data_types[data_types.dtype == 'object'].col.values
    data[data_object].to_csv('non_number.csv', index=False)
    col_val = data_types[data_types.dtype == 'float64'].col.values
    return data[col_val]
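The same effect in one call with pandas' select_dtypes (a sketch; it skips the CSV dump of the object columns):

data_float = data.select_dtypes(include=['float64'])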
8. Computing skewness and kurtosis
(1) Skewness and kurtosis with pandas
import pandas as pd
x = [53, 61, 49, 66, 78, 47]
s = pd.Series(x)
print(s.skew())
print(s.kurt())
pandas computes the bias-adjusted sample skewness $G_1$ and sample excess kurtosis $G_2$, with $m_k = \frac{1}{n}\sum_{i=1}^{n}(x_i - \bar{x})^k$:

$$G_1 = \frac{\sqrt{n(n-1)}}{n-2}\,\frac{m_3}{m_2^{3/2}}, \qquad G_2 = \frac{n-1}{(n-2)(n-3)}\left[(n+1)\frac{m_4}{m_2^2} - 3(n-1)\right]$$

The results are:
0.7826325504212567
-0.2631655441038463
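As a cross-check sketch (assuming SciPy is installed): with bias=False, scipy.stats applies the same adjustments and should reproduce the numbers above:

from scipy import stats

x = [53, 61, 49, 66, 78, 47]
print(stats.skew(x, bias=False))      # ~0.7826, matches s.skew()
print(stats.kurtosis(x, bias=False))  # ~-0.2632, matches s.kurt()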
(2) Skewness of a DataFrame
data_frame2 = pd.DataFrame({'A': [1, 2, 3, 4], 'b': [1, 0, 3, 1]})
data_frame2.skew(axis=0)
A 0.000000
b 1.129338
dtype: float64
skew(axis=None, skipna=None, level=None, numeric_only=None, **kwargs)
- axis : {index (0), columns (1)}
- skipna : boolean, default True. Exclude NA/null values. If an entire row/column is NA, the result will be NA.
- level : int or level name, default None. If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a Series.
- numeric_only : boolean, default None. Include only float, int, and boolean columns. If None, will attempt to use everything, then use only numeric data. Not implemented for Series.
II. Wrapper
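A typical wrapper method is recursive feature elimination (RFE): it repeatedly fits an estimator, ranks the features, and discards the weakest until the requested number remains. A minimal sketch with sklearn (the linear estimator and the feature count of 10 are placeholders, not choices from the original):

from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# fit, rank features by coefficient magnitude, drop the weakest,
# and repeat until n_features_to_select remain
rfe = RFE(estimator=LinearRegression(), n_features_to_select=10)
x_train_selected = rfe.fit_transform(x_train, y_train)
print(rfe.support_)   # boolean mask of the selected features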
III. Embedded
1. RF, AdaBoost, and ExtraTree each pick their top-k features, then merge the results
import pandas as pd
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

# Rank features by the best estimator's importances and return the top k
def get_top_k_feature(features, model, top_n_features):
    feature_imp_sorted = pd.DataFrame({
        'feature': features,
        'importance': model.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n = feature_imp_sorted.head(top_n_features)['feature']
    return features_top_n

def ensemble_model_feature(X, Y, top_n_features):
    features = list(X)
    param_grid = {'n_estimators': [900], 'random_state': [2, 4, 6, 8]}
    # Random forest
    rf = ensemble.RandomForestRegressor()
    rf_grid = GridSearchCV(rf, param_grid, cv=10, verbose=1, n_jobs=25)
    rf_grid.fit(X, Y)
    top_n_features_rf = get_top_k_feature(features=features, model=rf_grid, top_n_features=top_n_features)
    print('RF selection done')
    # AdaBoost
    abr = ensemble.AdaBoostRegressor()
    abr_grid = GridSearchCV(abr, param_grid, cv=10, n_jobs=25)
    abr_grid.fit(X, Y)
    top_n_features_abr = get_top_k_feature(features=features, model=abr_grid, top_n_features=top_n_features)
    print('AdaBoost selection done')
    # ExtraTrees
    etr = ensemble.ExtraTreesRegressor()
    etr_grid = GridSearchCV(etr, param_grid, cv=10, n_jobs=25)
    etr_grid.fit(X, Y)
    top_n_features_etr = get_top_k_feature(features=features, model=etr_grid, top_n_features=top_n_features)
    print('ExtraTrees selection done')
    # Merge the three top-k lists and drop duplicates
    features_top_n = pd.concat([top_n_features_rf, top_n_features_abr, top_n_features_etr],
                               ignore_index=True).drop_duplicates()
    print(features_top_n)
    print(len(features_top_n))
    return features_top_n
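A hypothetical usage sketch, keeping the union of the three models' top 50 features (the count is an arbitrary example):

features = ensemble_model_feature(x_train, y_train, top_n_features=50)
x_train = x_train[features.tolist()]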