# 导入必要的库
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# 导入数据
# Load the dataset into a DataFrame.
# NOTE(review): './*.csv' looks like a placeholder rather than a real file
# name; the string is handed to pd.read_csv unchanged — confirm and replace it
# with the actual CSV path.
data = pd.read_csv('./*.csv')
# 基于相关性进行特征选择
# Compute the pairwise correlation-coefficient matrix of the columns in `data`.
corr = data.corr()
# Preview the first rows of the matrix. As a bare expression this is presumably
# meant to be displayed by an interactive session / notebook cell.
corr.head()
# Plot the correlation-coefficient matrix as a heatmap.
sns.heatmap(corr)
def select_corr(data, threshold):
    """Drop features that are highly correlated with an earlier feature.

    A high correlation coefficient between two features indicates that they
    are redundant, so one of the pair can be removed. For every pair of
    columns ``(i, j)`` with ``i < j`` whose correlation coefficient is
    ``>= threshold``, the later column ``j`` is removed and column ``i`` is
    kept.

    Only the signed coefficient is compared, so strongly *negatively*
    correlated features are not treated as redundant.

    Args:
        data: Feature data as a pandas DataFrame.
        threshold: Correlation threshold; a column whose coefficient with any
            earlier column reaches this value is dropped.

    Returns:
        A new DataFrame containing the remaining features of ``data``.
    """
    # Compute the matrix from the argument itself instead of relying on a
    # module-level `corr`, so the result always matches the data passed in.
    corr = data.corr()
    too_high = corr.to_numpy() >= threshold
    # Strict upper triangle: entry (i, j) with i < j. A column is redundant
    # when any earlier column reaches the threshold against it. The diagonal
    # (self-correlation of 1.0) is excluded by k=1.
    redundant = np.triu(too_high, k=1).any(axis=0)
    return data.drop(columns=corr.columns[redundant])
# 基于P-value的特征选择
import statsmodels.api as sm
def backwardElimination(x, Y, sl, columns):
    """Backward elimination of features driven by OLS p-values.

    Repeatedly fits ``sm.OLS(Y, x)`` and removes the single column with the
    largest p-value, as long as that p-value is greater than ``sl``. The
    matching entry of ``columns`` is removed together with the column so the
    two stay aligned. The summary of the last fitted model is printed.

    Args:
        x: 2-D feature matrix (one column per feature).
        Y: Target values.
        sl: Significance level; a column is removed while the largest
            p-value exceeds this threshold.
        columns: Array of column names, in the same order as the columns
            of ``x``.

    Returns:
        A tuple ``(x, columns)`` with the eliminated columns removed.
    """
    regressor_OLS = None
    # At most one column is removed per fit, so the number of columns bounds
    # the number of iterations.
    for _ in range(np.shape(x)[1]):
        regressor_OLS = sm.OLS(Y, x).fit()
        pvalues = np.asarray(regressor_OLS.pvalues, dtype=float)
        # Remove exactly one column per fit: deleting several tied columns
        # with indices taken before the first deletion would hit the wrong
        # column (or run past the end of the array).
        worst = int(np.argmax(pvalues))
        # Written as a negated '>' so that a NaN maximum stops elimination
        # instead of triggering a deletion.
        if not (pvalues[worst] > sl):
            break
        x = np.delete(x, worst, 1)  # drop the column with the largest p-value
        columns = np.delete(columns, worst)
    # No model exists when x has no feature columns.
    if regressor_OLS is not None:
        print(regressor_OLS.summary())  # print the regression report
    return x, columns