# ========== RandomForestClassifier ==========
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import csv
from sklearn.model_selection import train_test_split # train/test split
from sklearn.preprocessing import Imputer # missing-value imputation; NOTE(review): removed in scikit-learn 0.22 -- use sklearn.impute.SimpleImputer (unused below, so safe to drop)
from sklearn.preprocessing import MinMaxScaler # scale features to [0, 1]
from sklearn.preprocessing import label_binarize # label binarization
from sklearn.decomposition import PCA # dimensionality reduction
from sklearn.ensemble import RandomForestClassifier # random-forest classification model
from sklearn import metrics # ROC/AUC
from sklearn.metrics import roc_auc_score
from sklearn import cross_validation,metrics # NOTE(review): sklearn.cross_validation was removed in 0.20 -- this import fails on modern scikit-learn; train_test_split above already covers it
# Configure matplotlib so CJK text in labels renders correctly.
mpl.rcParams['font.sans-serif'] = [u'simHei']
mpl.rcParams['axes.unicode_minus'] = False

# 1. Load the data (replace the placeholder paths before running).
path1 = 'path'
path2 = 'path'
dftrain = pd.read_csv(path1, header=None)
dftest = pd.read_csv(path2, header=None)
# print(df.columns) to inspect the column names
feature = u'****'

# 2. Split into features (columns 0-7) and label (column 8).
X_train, Y_train = dftrain[list(range(8))], dftrain[8]
X_test = dftest[list(range(8))]

# 5. Normalize features. MinMaxScaler is the usual choice for classifiers;
#    StandardScaler is more common for regression models.
ss = MinMaxScaler()
X_train = ss.fit_transform(X_train)  # fit on training data only (y is ignored by scalers)
X_test = ss.transform(X_test)        # reuse the training-set statistics

# 6. Optional dimensionality reduction (dataset is fairly high-dimensional).
#pca = PCA(n_components=2)
#X_train = pca.fit_transform(X_train)
#X_test = pca.transform(X_test)

# 7. Build and train the random forest.
# n_estimators: number of trees; max_depth: depth per tree; random_state: seed.
#forest = RandomForestClassifier(n_estimators=2000, criterion='gini', max_depth=25, random_state=5)
forest = RandomForestClassifier(n_estimators=1000, max_features=6, max_depth=25,
                                oob_score=True, random_state=10)
forest.fit(X_train, Y_train)

# 8. Evaluate on the training set (no test labels are available in this script).
score1 = forest.score(X_train, Y_train)
print('train准确率:%.2f%%' % (score1 * 100))

# 9. Predict class probabilities for the test set and dump them to CSV.
forest_y_score = forest.predict_proba(X_test)   # shape (n_samples, n_classes)
forest_y_score1 = forest.predict_proba(X_train)
p = forest.predict(X_test)
# Take the first two class-probability columns directly; sizing follows the
# data instead of the original hard-coded 20000-row copy loop, which raised
# IndexError for smaller test sets and zero-padded larger ones.
returnMat = np.asarray(forest_y_score)[:, :2]
# NOTE(review): path contains doubled backslashes ('C:\\test.csv' at runtime) -- confirm it is intended.
with open('C:\\\\test.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(returnMat)
# ========== SVM ==========
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings
from sklearn import svm # SVM models
from sklearn.model_selection import train_test_split # train/test split
from sklearn.metrics import accuracy_score # accuracy metric
from sklearn.exceptions import ChangedBehaviorWarning # warning handling; NOTE(review): deprecated in scikit-learn 0.22 and later removed -- confirm against the installed version
# Configure matplotlib so CJK text in labels renders correctly.
mpl.rcParams['font.sans-serif'] = [u'simHei']
mpl.rcParams['axes.unicode_minus'] = False

# Suppress the scikit-learn behavior-change warning.
warnings.filterwarnings('ignore', category=ChangedBehaviorWarning)

# 1. Load the data (replace the placeholder paths before running).
path1 = 'path'
path2 = 'path'
dftrain = pd.read_csv(path1, header=None)
dftest = pd.read_csv(path2, header=None)
# print(df.columns) to inspect the column names
feature = u'***'

# 2. Split into features (columns 0-7) and label (column 8).
X_train, Y_train = dftrain[list(range(8))], dftrain[8]
X_test, Y_test = dftest[list(range(8))], dftest[8]

# 4. Build the model.
# svm.SVC parameter summary (see scikit-learn docs for the full reference):
#   C:      penalty of the error term (default 1.0, must be > 0); larger C fits
#           the training set more closely but risks overfitting.
#   kernel: 'linear', 'poly', 'rbf' (default), 'sigmoid' or 'precomputed'.
#   degree: polynomial degree when kernel='poly' (default 3).
#   gamma:  kernel coefficient for poly/rbf/sigmoid; 'auto' means 1/n_features.
#           Larger gamma fits the training set better but can overfit; smaller
#           gamma generalizes better but can underfit, lowering both accuracies.
#   coef0:  independent term for poly/sigmoid kernels (default 0).
#   probability: enable probability estimates (off by default; not recommended).
#   shrinking: use the shrinking heuristic (default True).
#   tol:    stopping tolerance (default 1e-3).
#   cache_size: kernel cache in MB; class_weight: per-class weights.
#   max_iter: -1 means unlimited; decision_function_shape: 'ovo'/'ovr' ('ovr' recommended).
clf = svm.SVC(C=1, kernel='rbf', gamma=0.05)

# 5. Train the model.
clf.fit(X_train, Y_train)

# 6. Evaluate: accuracy on the training and test sets.
print('训练集准确率:', accuracy_score(Y_train, clf.predict(X_train)))
# Fixed label: this line scores the TEST set; the original text said "training set".
print('测试集准确率:', accuracy_score(Y_test, clf.predict(X_test)))

# decision_function returns each sample's signed distance to the separating
# hyperplane(s); predict returns the final class labels.
print('decision_function:\n', clf.decision_function(X_train))
print('\npredict:\n', clf.predict(X_train))
# ========== xgboost ==========
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings # warning handling
from sklearn.linear_model.coordinate_descent import ConvergenceWarning # NOTE(review): private module path, removed in newer scikit-learn -- import from sklearn.exceptions instead
from sklearn.model_selection import train_test_split # train/test split
from sklearn.metrics import accuracy_score # accuracy metric
from sklearn.metrics import mean_squared_error # mean squared error
import xgboost as xgb # xgboost model
# Configure matplotlib so CJK text in labels renders correctly.
mpl.rcParams['font.sans-serif'] = [u'simHei']
mpl.rcParams['axes.unicode_minus'] = False

# Suppress convergence warnings.
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

# 1. Load the data (replace the placeholder path before running).
iris_feature = u'***'
path = 'path'
data = pd.read_csv(path, header=None)

# 2. Split into features (columns 0-7) and label (column 8).
X, Y = data[list(range(8))], data[8]
# To encode text labels as integers: Y = pd.Categorical(Y).codes
# (pd.Categorical(Y).categories recovers the original values.)

# 3. 80/20 train/test split, fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8, random_state=0)

# 5. Convert to xgboost's DMatrix format.
dtrain = xgb.DMatrix(X_train, label=Y_train)
dtest = xgb.DMatrix(X_test)

# 6. Model construction and training.
# NOTE(review): 'reg:linear' was renamed 'reg:squarederror' and 'silent' was
# replaced by 'verbosity' in newer xgboost releases -- update when upgrading.
params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'reg:linear'}  # linear regression objective
num_round = 2  # number of boosting rounds
bst = xgb.train(params, dtrain, num_round)
bst.save_model('xgb.model')  # persist the trained booster

# 7. Predict and report the mean squared error.
y_pred = bst.predict(dtest)
print("均方误差为:", mean_squared_error(Y_test, y_pred))

# 8. Reload the saved model and verify it reproduces the predictions.
bst2 = xgb.Booster()
bst2.load_model('xgb.model')
y_pred2 = bst2.predict(dtest)
print("均方误差为:", mean_squared_error(Y_test, y_pred2))

# 9. Accuracy. The original passed the Booster object itself to
# accuracy_score, which raises; score the predictions instead. The regression
# objective outputs continuous scores, so round to the nearest integer class
# label first (assumes integer class labels, e.g. 0/1 -- TODO confirm).
print('测试集准确率:', accuracy_score(Y_test, np.rint(y_pred).astype(int)))