Predicting the iris dataset classes with logistic regression:
# Train a logistic regression model on the iris data; this is a multi-class classification task
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_score
# 1. Load the dataset
iris = load_iris()
# Inspect the dataset keys, then separate the features from the labels
# print(iris.keys())
X_data = iris.data
y_data = iris.target
# print(X_data)
# print(y_data)
# 2. Split the data, reserving one tenth of it as the test set
# random_state=0 is set manually: the split is random, but reusing the same seed makes the next run reproduce this one
X_train, X_test, Y_train, Y_test = train_test_split(X_data, y_data, test_size=0.1, random_state=0)
# Returns the training features, test features, training labels, and test labels, in that order
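# Optional (not in the original): with only 15 test samples, a stratified split keeps the
# three classes represented in the same proportions; a sketch, assuming the same X_data/y_data:
# X_train, X_test, Y_train, Y_test = train_test_split(
#     X_data, y_data, test_size=0.1, random_state=0, stratify=y_data)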
# 3. Standardize the data
transfer = StandardScaler()
X_train = transfer.fit_transform(X_train)
X_test = transfer.transform(X_test)  # reuse the training-set mean/std; do not refit the scaler on the test set
# 4. Build and train the model
estimator = LogisticRegression(penalty='l2', solver='newton-cg', multi_class='multinomial')
estimator.fit(X_train, Y_train)
# 5. Evaluate the model
print("\nLearned weights:", estimator.coef_)
print("\nLogistic Regression training-set accuracy: %.1f%%" % (estimator.score(X_train, Y_train) * 100))
# 6. Make predictions
y_predict = estimator.predict(X_test)
print("\n预测结果为:\n", y_predict)
print("\n比对真实值和预测值:\n", y_predict == Y_test)
# 预测的准确率
accuracy = metrics.accuracy_score(Y_test, y_predict)
print("\nLogistic Regression 模型测试集的正确率:%.1f%%" % (accuracy*100))
# 7. Cross-validation
scores = cross_val_score(estimator, X_data, y_data, scoring=None, cv=10)  # cv is the number of folds; scoring=None uses the estimator's default metric (accuracy)
print("\nCross-validation accuracies:", np.round(scores, 2))  # accuracy of each fold
print("\nCross-validation accuracy: %0.2f%% (+/- %0.2f%%)" % (scores.mean() * 100, scores.std() * 2 * 100))  # mean +/- 2 standard deviations, roughly a 95% interval