- 导入相关的包
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
- 导入数据
# Load the Titanic training set and keep an independent copy of only the
# identifier, target and candidate feature columns used in this analysis.
df = pd.read_csv("./datas/titanic/titanic_train.csv").loc[
    :,
    [
        "PassengerId",
        "Survived",
        "Pclass",
        "Sex",
        "Age",
        "SibSp",
        "Parch",
        "Fare",
        "Embarked",
    ],
].copy()
df.head()
- 数据清理和转换
# Inspect column dtypes and non-null counts before cleaning.
df.info()

# Replace missing ages with the median age.
df["Age"] = df["Age"].fillna(df["Age"].median())

# Encode Sex as an integer code: male -> 0, female -> 1.
# Mapping the whole column at once yields a numeric column, instead of
# writing ints into a string column and leaving a mixed object dtype.
df["Sex"].unique()
df["Sex"] = df["Sex"].map({"male": 0, "female": 1})

# Encode Embarked as an integer code: missing -> 0, S -> 1, C -> 2, Q -> 3.
# NOTE: any label other than S/C/Q would also end up as 0 here.
df["Embarked"].unique()
df["Embarked"] = (
    df["Embarked"].map({"S": 1, "C": 2, "Q": 3}).fillna(0).astype(int)
)
df.head()
- 将特征列和结果列拆分开
# X shares the same object as df; popping the target column removes it
# in place, so X is left holding only the feature columns.
X = df
y = X.pop("Survived")
X.head()
y.head()
- 使用卡方检验选择topK的特征
# Score every feature against the target with the chi-squared test.
# k equals the number of columns, so no feature is dropped here; the
# selector is used only to obtain a score per feature.
n_features = X.shape[1]
bestfeatures = SelectKBest(score_func=chi2, k=n_features)
fit = bestfeatures.fit(X, y)
- 按照重要性顺序打印特征列表
# Build a two-column table pairing each feature name with its score.
df_scores = pd.DataFrame(fit.scores_)
df_scores
df_columns = pd.DataFrame(X.columns)
df_columns
df_feature_scores = pd.concat([df_columns, df_scores], axis=1)
df_feature_scores.columns = ["feature_name", "Score"]

# sort_values returns a new DataFrame rather than sorting in place, so the
# result is assigned back; otherwise the table shown last stays unsorted.
df_feature_scores = df_feature_scores.sort_values(by="Score", ascending=False)
df_feature_scores