DataWhale AI夏令营-催化反应速率预测
优化方案-使用梯度提升回归模型
梯度提升回归模型(Gradient Boosting Regressor),相较于随机森林增加了learning_rate学习率参数,能够更好地减小模型过拟合的风险;并且梯度提升回归模型对超参数更为敏感,通过网格搜索、贝叶斯搜索等自动化调参方法更容易拿到高一些的分数。
目前本人使用梯度提升回归模型拿到了0.37+,在笔记中我也为大家提供了一组参数,经过测试拿到0.35+是没有问题的。
话不多说,开始上代码。
#####################
导包
# 首先,导入库
import pickle
import pandas as pd
from tqdm import tqdm
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from rdkit.Chem import rdMolDescriptors
from rdkit import RDLogger, Chem
import numpy as np
RDLogger.DisableLog('rdApp.*')
生成分子指纹
def mfgen(mol, nBits=2048, radius=2):
    """Return the Morgan fingerprint of a molecule as a 0/1 integer array.

    Args:
        mol: an RDKit Mol object (must not be None).
        nBits: length of the fingerprint bit vector.
        radius: Morgan fingerprint radius.

    Returns:
        np.ndarray of shape (nBits,) containing integer 0/1 entries.
    """
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
    # Use int() instead of eval(): calling eval() on every character is far
    # slower and is an eval-on-data anti-pattern; each character is only
    # ever '0' or '1'.
    return np.array([int(bit) for bit in fp.ToBitString()])
def vec_cpd_lst(smi_lst, nBits=2048, radius=2):
    """Vectorize a list of SMILES strings into Morgan-fingerprint rows.

    Each distinct SMILES is featurized once (cached in a dict), so
    duplicated compounds cost nothing extra.

    Args:
        smi_lst: list of SMILES strings; may contain duplicates and ''.
        nBits: fingerprint length, forwarded to mfgen (default keeps the
            original hard-coded 2048).
        radius: Morgan radius, forwarded to mfgen.

    Returns:
        np.ndarray of shape (len(smi_lst), nBits).

    Raises:
        ValueError: if a SMILES string cannot be parsed by RDKit.
    """
    smi_vec_map = {}
    for smi in tqdm(set(smi_lst)):  # tqdm: progress bar
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # Previously an unparsable SMILES crashed inside mfgen with an
            # opaque error; fail loudly with the offending string instead.
            raise ValueError(f'Cannot parse SMILES: {smi!r}')
        smi_vec_map[smi] = mfgen(mol, nBits=nBits, radius=radius)
    # The empty string stands for "no compound": map it to an all-zero
    # vector. (Was hard-coded to 2048; now always consistent with nBits.)
    smi_vec_map[''] = np.zeros(nBits)
    return np.array([smi_vec_map[smi] for smi in smi_lst])
加载本地数据
dataset_dir = '../dataset'  # NOTE: change this to 'dataset' when running on AI Studio
train_df = pd.read_csv(f'{dataset_dir}/round1_train_data.csv')
test_df = pd.read_csv(f'{dataset_dir}/round1_test_data.csv')
print(f'Training set size: {len(train_df)}, test set size: {len(test_df)}')
读取数据并转化为分子指纹
# Read the four compound columns (SMILES strings) from the training csv.
train_rct1_smi = train_df['Reactant1'].to_list()
train_rct2_smi = train_df['Reactant2'].to_list()
train_add_smi = train_df['Additive'].to_list()
train_sol_smi = train_df['Solvent'].to_list()
# Turn each SMILES list into a matrix of Morgan fingerprints.
train_rct1_fp = vec_cpd_lst(train_rct1_smi)
train_rct2_fp = vec_cpd_lst(train_rct2_smi)
train_add_fp = vec_cpd_lst(train_add_smi)
train_sol_fp = vec_cpd_lst(train_sol_smi)
# Concatenate along axis=1: the Reactant1, Reactant2, Additive and Solvent
# fingerprints of one record are joined into a single feature vector.
train_x = np.concatenate([train_rct1_fp, train_rct2_fp, train_add_fp, train_sol_fp], axis=1)
train_y = train_df['Yield'].to_numpy()
# Apply the same featurization pipeline to the test set.
test_rct1_smi = test_df['Reactant1'].to_list()
test_rct2_smi = test_df['Reactant2'].to_list()
test_add_smi = test_df['Additive'].to_list()
test_sol_smi = test_df['Solvent'].to_list()
test_rct1_fp = vec_cpd_lst(test_rct1_smi)
test_rct2_fp = vec_cpd_lst(test_rct2_smi)
test_add_fp = vec_cpd_lst(test_add_smi)
test_sol_fp = vec_cpd_lst(test_sol_smi)
test_x = np.concatenate([test_rct1_fp, test_rct2_fp, test_add_fp, test_sol_fp], axis=1)
这里我还是使用的传统的网格搜索的方式,大家感兴趣的话可以尝试其他方法。我这里也为大家提供一组我测试过的参数,拿到0.35+肯定没问题:
Best parameters found: {'learning_rate': 0.05, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}
# Hyper-parameter grid for GridSearchCV: 3*3*3*3*3*1 = 243 candidate
# combinations, each fitted cv=5 times — plan for a long run.
param_grid = {
    'n_estimators': [100, 200, 300],     # number of boosting stages
    'learning_rate': [0.01, 0.05, 0.1],  # shrinkage applied to each stage
    'max_depth': [8, 10, 20],            # depth of each individual tree
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt']             # features considered per split
}
开始训练,这个部分是我血与泪的教训,建议大家无论用什么调参方法千万少写参数范围,因为官方提供的阿里云机器跑太久了总会出现奇怪的错误导致你白跑。如果你自己本地部署了当我没说。但是还是要提醒大家,梯度提升回归模型训练需要的时间会比随机森林长一些,大家自己斟酌。
# Instantiate the gradient boosting regressor; all hyper-parameters are
# supplied by the grid above.
model = GradientBoostingRegressor()
# Exhaustive hyper-parameter search with 5-fold cross-validation,
# parallelized across all CPU cores (n_jobs=-1).
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(train_x, train_y)
打印最优参数
# Print the best hyper-parameter combination found by the grid search.
print("最优结果:", grid_search.best_params_)
# best_estimator_ is the model refit on the full training set with the
# best parameters (GridSearchCV's default refit=True).
best_model = grid_search.best_estimator_
保存模型推理预测
# Persist the fitted model to disk with pickle.
with open('./gradient_boosting_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)
# Reload it, demonstrating that inference can run from the saved artifact.
# NOTE(review): only unpickle files you trust — pickle.load executes
# arbitrary code embedded in the file.
with open('gradient_boosting_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)
# Predict yields for the test set.
test_pred = loaded_model.predict(test_x)
生成赛事提交文件
# Assemble the submission file: a CSV header followed by one
# "test<i>,<yield>" row per prediction, yields rounded to 4 decimals.
header = 'rxnid,Yield'
rows = [f'test{i + 1},{pred:.4f}' for i, pred in enumerate(test_pred)]
with open('./submit.txt', 'w') as fw:
    fw.write('\n'.join([header] + rows))
一站式运行
在mp/code下新建GBR_GS.py文件,在终端运行python GBR_GS.py等待结果即可(这份代码里的网格参数我只给出了一组,经过我测试是0.35+没问题,大家也可以在此基础上继续调参)
# 首先,导入库
import pickle
import pandas as pd
from tqdm import tqdm
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from rdkit.Chem import rdMolDescriptors
from rdkit import RDLogger, Chem
import numpy as np
RDLogger.DisableLog('rdApp.*')
def mfgen(mol, nBits=2048, radius=2):
    """Return the Morgan fingerprint of a molecule as a 0/1 integer array.

    Args:
        mol: an RDKit Mol object (must not be None).
        nBits: length of the fingerprint bit vector.
        radius: Morgan fingerprint radius.

    Returns:
        np.ndarray of shape (nBits,) containing integer 0/1 entries.
    """
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
    # Use int() instead of eval(): calling eval() on every character is far
    # slower and is an eval-on-data anti-pattern; each character is only
    # ever '0' or '1'.
    return np.array([int(bit) for bit in fp.ToBitString()])
def vec_cpd_lst(smi_lst, nBits=2048, radius=2):
    """Vectorize a list of SMILES strings into Morgan-fingerprint rows.

    Each distinct SMILES is featurized once (cached in a dict), so
    duplicated compounds cost nothing extra.

    Args:
        smi_lst: list of SMILES strings; may contain duplicates and ''.
        nBits: fingerprint length, forwarded to mfgen (default keeps the
            original hard-coded 2048).
        radius: Morgan radius, forwarded to mfgen.

    Returns:
        np.ndarray of shape (len(smi_lst), nBits).

    Raises:
        ValueError: if a SMILES string cannot be parsed by RDKit.
    """
    smi_vec_map = {}
    for smi in tqdm(set(smi_lst)):  # tqdm: progress bar
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # Previously an unparsable SMILES crashed inside mfgen with an
            # opaque error; fail loudly with the offending string instead.
            raise ValueError(f'Cannot parse SMILES: {smi!r}')
        smi_vec_map[smi] = mfgen(mol, nBits=nBits, radius=radius)
    # The empty string stands for "no compound": map it to an all-zero
    # vector. (Was hard-coded to 2048; now always consistent with nBits.)
    smi_vec_map[''] = np.zeros(nBits)
    return np.array([smi_vec_map[smi] for smi in smi_lst])
dataset_dir = '../dataset'  # NOTE: change this to 'dataset' when running on AI Studio
train_df = pd.read_csv(f'{dataset_dir}/round1_train_data.csv')
test_df = pd.read_csv(f'{dataset_dir}/round1_test_data.csv')
print(f'Training set size: {len(train_df)}, test set size: {len(test_df)}')
# Read the four compound columns (SMILES strings) from the training csv.
train_rct1_smi = train_df['Reactant1'].to_list()
train_rct2_smi = train_df['Reactant2'].to_list()
train_add_smi = train_df['Additive'].to_list()
train_sol_smi = train_df['Solvent'].to_list()
# Turn each SMILES list into a matrix of Morgan fingerprints.
train_rct1_fp = vec_cpd_lst(train_rct1_smi)
train_rct2_fp = vec_cpd_lst(train_rct2_smi)
train_add_fp = vec_cpd_lst(train_add_smi)
train_sol_fp = vec_cpd_lst(train_sol_smi)
# Concatenate along axis=1: the Reactant1, Reactant2, Additive and Solvent
# fingerprints of one record are joined into a single feature vector.
train_x = np.concatenate([train_rct1_fp, train_rct2_fp, train_add_fp, train_sol_fp], axis=1)
train_y = train_df['Yield'].to_numpy()
# Apply the same featurization pipeline to the test set.
test_rct1_smi = test_df['Reactant1'].to_list()
test_rct2_smi = test_df['Reactant2'].to_list()
test_add_smi = test_df['Additive'].to_list()
test_sol_smi = test_df['Solvent'].to_list()
test_rct1_fp = vec_cpd_lst(test_rct1_smi)
test_rct2_fp = vec_cpd_lst(test_rct2_smi)
test_add_fp = vec_cpd_lst(test_add_smi)
test_sol_fp = vec_cpd_lst(test_sol_smi)
test_x = np.concatenate([test_rct1_fp, test_rct2_fp, test_add_fp, test_sol_fp], axis=1)
# Single-point "grid": the one parameter combination the author verified
# to reach a 0.35+ leaderboard score. Extend the lists to search further.
param_grid = {
    'n_estimators': [300,],       # number of boosting stages
    'learning_rate': [0.05,],     # shrinkage applied to each stage
    'max_depth': [20,],           # depth of each individual tree
    'min_samples_split': [5, ],
    'min_samples_leaf': [2, ],
    'max_features': ['sqrt']      # features considered per split
}
# Instantiate the gradient boosting regressor; hyper-parameters come from
# the (single-point) grid above.
model = GradientBoostingRegressor()
# Grid search with 5-fold cross-validation on all CPU cores (n_jobs=-1).
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(train_x, train_y)
# Print the best hyper-parameter combination found by the grid search.
print("最优参数: ", grid_search.best_params_)
# best_estimator_ is the model refit on the full training set with the
# best parameters (GridSearchCV's default refit=True).
best_model = grid_search.best_estimator_
# Persist the fitted model to disk with pickle.
with open('./gradient_boosting_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)
# Reload it, demonstrating that inference can run from the saved artifact.
# NOTE(review): only unpickle files you trust — pickle.load executes
# arbitrary code embedded in the file.
with open('gradient_boosting_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)
# Predict yields for the test set.
test_pred = loaded_model.predict(test_x)
# Assemble the submission file: a CSV header followed by one
# "test<i>,<yield>" row per prediction, yields rounded to 4 decimals.
header = 'rxnid,Yield'
rows = [f'test{i + 1},{pred:.4f}' for i, pred in enumerate(test_pred)]
with open('./submit.txt', 'w') as fw:
    fw.write('\n'.join([header] + rows))
加油吧,祝你取得心仪的分数哦,后面我可能会继续尝试新模型,因为特征工程做起来感觉比较麻烦,暂时不想考虑(打工人没时间,唉!)。