给定包含了5维信息的训练数据,训练出一个贝叶斯分类器和Naive Bayes分类器对测试数据进行二分类
Bayes Decision Rule
import numpy as np
from scipy.io import loadmat, savemat
# Load the 5-D training samples, their binary class labels (+1 / -1), and the
# unlabeled test samples from MATLAB .mat files; each dict key matches the
# variable name stored inside the corresponding file.
data_train = loadmat('data_train.mat')['data_train']
label_train = loadmat('label_train.mat')['label_train'].ravel()  # Flatten (n, 1) label matrix to a 1-D array
data_test = loadmat('data_test.mat')['data_test']
# Estimate the class priors and the per-class Gaussian parameters from the
# training set. Class "0" denotes label -1 and class "1" denotes label +1.
samples_neg = data_train[label_train == -1]
samples_pos = data_train[label_train == 1]
# Priors: fraction of training samples belonging to each class.
prior_prob_0 = np.mean(label_train == -1)
prior_prob_1 = np.mean(label_train == 1)
# Per-feature mean and variance (used by the diagonal/naive density).
mean_0 = samples_neg.mean(axis=0)
var_0 = samples_neg.var(axis=0)
mean_1 = samples_pos.mean(axis=0)
var_1 = samples_pos.var(axis=0)
# Full covariance matrices (rows are observations, columns are features).
cov_0 = np.cov(samples_neg, rowvar=False)
cov_1 = np.cov(samples_pos, rowvar=False)
# Univariate Gaussian probability density, evaluated per feature.
def gaussian_pdf(x, mean, var):
    """Return the density of x under the normal distribution N(mean, var)."""
    norm_const = 1.0 / np.sqrt(2.0 * np.pi * var)
    exponent = -((x - mean) ** 2) / (2.0 * var)
    return norm_const * np.exp(exponent)
# Predict with the full Bayes Decision Rule.
# BUG FIX: the original loop multiplied per-feature 1-D Gaussians, which is
# exactly the Naive Bayes independence assumption — the full covariance
# matrices cov_0/cov_1 were computed (and written to the report) but never
# used, making this classifier identical to the GaussianNB script. Here the
# class-conditional density is the full multivariate Gaussian N(mean, cov),
# so feature correlations influence the decision.
def _gaussian_logpdf_full(x, mean, cov):
    """Log-density of the multivariate Gaussian N(mean, cov) at point x."""
    diff = x - mean
    # slogdet avoids overflow/underflow of the determinant; solve avoids
    # explicitly inverting the covariance matrix.
    _, logdet = np.linalg.slogdet(cov)
    dim = mean.shape[0]
    return -0.5 * (dim * np.log(2.0 * np.pi) + logdet
                   + diff @ np.linalg.solve(cov, diff))

y_pred = []
for sample in data_test:
    # Unnormalized log-posterior: log prior + log likelihood per class.
    log_post_0 = np.log(prior_prob_0) + _gaussian_logpdf_full(sample, mean_0, cov_0)
    log_post_1 = np.log(prior_prob_1) + _gaussian_logpdf_full(sample, mean_1, cov_1)
    # Maximum a posteriori decision; ties go to class +1 as in the original.
    y_pred.append(-1 if log_post_0 > log_post_1 else 1)
y_pred = np.array(y_pred)
# Save results: predicted labels as a MATLAB column vector, plus a
# human-readable dump of every estimated model parameter.
savemat('BDR_label_test.mat', {'BDR_label_test': y_pred.reshape(-1, 1)})
with open('BDR_parameters.txt', 'w', encoding='utf-8') as f:
    f.write("Model parameters:\n")
    # Row vectors (means/variances) first, then the covariance matrices.
    vector_sections = (
        ("Mean class -1:\n", mean_0),
        ("Mean class 1:\n", mean_1),
        ("\nVariance class -1:\n", var_0),
        ("\nVariance class 1:\n", var_1),
    )
    for header, vec in vector_sections:
        f.write(header)
        np.savetxt(f, vec.reshape(1, -1), fmt='%s')
    matrix_sections = (
        ("\nCovariance class -1:\n", cov_0),
        ("\nCovariance class 1:\n", cov_1),
    )
    for header, mat in matrix_sections:
        f.write(header)
        np.savetxt(f, mat, fmt='%0.4f')
通过计算训练数据中class为1和-1的均值和方差,通过高斯分布得到每个类别的条件概率和先验概率,然后将待预测数据代入到每个类别的概率密度函数中,计算最大后验概率来确定待预测数据应该归属于哪一个类别。
Naive Bayes
from sklearn.naive_bayes import GaussianNB
import numpy as np
from scipy.io import loadmat, savemat
# Load the same training/test data as the Bayes Decision Rule script:
# 5-D samples plus binary labels (+1 / -1) stored in MATLAB .mat files.
data_train = loadmat('data_train.mat')['data_train']
label_train = loadmat('label_train.mat')['label_train'].ravel()  # Flatten (n, 1) label matrix to a 1-D array
data_test = loadmat('data_test.mat')['data_test']
# Gaussian Naive Bayes: fits a per-class, per-feature mean and variance,
# assuming conditional independence between features.
gnb = GaussianNB()
gnb.fit(data_train, label_train)      # Train
y_pred = gnb.predict(data_test)       # Predict
print("Model parameters:")
print("Mean:", gnb.theta_)
print("Variance:", gnb.var_)
# Per-class full covariance matrices, computed only for the report below;
# GaussianNB itself uses just the diagonal (per-feature variances).
covariances = {
    f'covariance_class_{cls}': np.cov(data_train[label_train == cls], rowvar=False)
    for cls in np.unique(label_train)
}
# Dump the learned Naive Bayes parameters to a human-readable text file.
with open('NB_parameters.txt', 'w', encoding='utf-8') as f:
    f.write("Model parameters:\n")
    f.write("Mean:\n")
    np.savetxt(f, gnb.theta_, fmt='%s')  # one row per class
    f.write("\nVariance:\n")
    np.savetxt(f, gnb.var_, fmt='%s')  # one row per class
    # Append each class's full covariance matrix (reported for reference;
    # GaussianNB does not use the off-diagonal terms).
    for key, cov_mat in covariances.items():
        f.write(f"{key}:\n")
        np.savetxt(f, cov_mat, fmt='%0.4f', newline='\n')
# Save the predicted labels as a MATLAB column vector.
# Fixes: the original unpacked data_test.shape into n_samples/n_features
# without using them, used np.tile(y_pred[:, np.newaxis], 1) — a no-op tile
# where a plain reshape is meant — and printed the wrong output filename.
label_test = {'NB_label_test': y_pred.reshape(-1, 1)}  # (n_samples, 1)
savemat('NB_label_test.mat', label_test)
print("Predicted labels saved to NB_label_test.mat.")
大致方法与上面的Bayes Decision Rule相同,不同的是在这里我直接调用了Naive Bayes的函数。最后输出的预测结果与Bayes Decision Rule相同,导致这一结果的主要原因是教授给的测试数据的数据与数据之间的协方差很小,而Naive Bayes和Bayes Decision Rule之间最大的区别就是Naive Bayes忽略了数据与数据之间的影响,其假设所有的特征相互独立,互相不受影响。因此当测试数据的特征之间的协方差很小时,这两种方法进行分类的结果将会基本一样。