import sklearn
from sklearn.svm import LinearSVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import psutil
import torch
import torch.nn as nn
import seaborn as sns
from Transformer import TransformerRegressor,EnhancedHybridTransformerRegressor
# --- Plot style and CJK font configuration ---
sns.set_style("whitegrid")
sns.set_palette("husl")
try:
    # Use Microsoft YaHei so Chinese axis labels/titles render correctly.
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
    plt.rcParams['axes.unicode_minus'] = False
except Exception:  # was a bare `except:` — never swallow SystemExit/KeyboardInterrupt
    # Fallback: register the font file directly (Windows path).
    # NOTE(review): assigning rcParams rarely raises even when the font is
    # absent, so this fallback may never actually trigger — verify on target.
    from matplotlib.font_manager import fontManager
    fontManager.addfont('C:/Windows/Fonts/msyh.ttc')
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
    plt.rcParams['axes.unicode_minus'] = False
def print_memory_usage():
    """Print the resident set size (RSS) of the current process in MB."""
    rss_mb = psutil.Process().memory_info().rss / 1024 ** 2
    print(f"内存占用: {rss_mb:.2f} MB")
# --- Load data, shuffle, split into train/test, scale, and tensorize ---
df = pd.read_csv('1_encoded.csv')
outdim = 1          # number of target columns (the trailing columns of the CSV)
train_frac = 0.7    # fraction of rows used for training

n_rows = df.shape[0]
# Shuffle the rows (no fixed seed, so each run uses a different split).
df = df.iloc[np.random.permutation(n_rows), :]

n_train = round(train_frac * n_rows)
f_ = df.shape[1] - outdim  # number of feature columns

features = df.iloc[:, :f_].values
targets = df.iloc[:, f_:].values.reshape(-1, 1)
P_train, P_test = features[:n_train], features[n_train:]
T_train, T_test = targets[:n_train], targets[n_train:]

# Standardize features using statistics fit on the training split only.
scaler = StandardScaler()
P_train_scaled = scaler.fit_transform(P_train)
P_test_scaled = scaler.transform(P_test)

# Shape (N, 1, f_): each sample becomes a length-1 sequence for the Transformer.
train_X = torch.tensor(P_train_scaled.reshape(-1, 1, f_), dtype=torch.float32)
train_Y = torch.tensor(T_train, dtype=torch.float32)
test_X = torch.tensor(P_test_scaled.reshape(-1, 1, f_), dtype=torch.float32)
test_Y = torch.tensor(T_test, dtype=torch.float32)
# --- DataLoader, model, optimizer, and loss ---
train_loader = DataLoader(TensorDataset(train_X, train_Y),
                          batch_size=64,
                          shuffle=True,
                          pin_memory=True)  # faster host->GPU copies; harmless on CPU
model = TransformerRegressor(f_, outdim)
# (removed a commented-out duplicate of the optimizer line — dead code)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
criterion = nn.MSELoss()
# --- Training loop: 100 epochs, MSE loss, gradient-norm clipping at 1.0 ---
for epoch in range(100):
    model.train()
    for xb, yb in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        # Clip the global gradient norm to stabilize Transformer training.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
    # Report the last mini-batch's loss and process memory every 10 epochs.
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')
        print_memory_usage()
# --- Persist checkpoint: weights, optimizer state, dims, and scaler stats ---
# Scaler statistics are saved so inference can reproduce the exact scaling.
checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'input_dim': f_,
    'output_dim': outdim,
    'scaler_mean': scaler.mean_,
    'scaler_scale': scaler.scale_,
}
torch.save(checkpoint, 'model_1000.pth')
# --- Inference on the held-out set, CSV export, and regression metrics ---
model.eval()
with torch.no_grad():
    test_pred = model(test_X).numpy().flatten()

y_true = T_test.flatten()
results_df = pd.DataFrame({
    'True_Value': y_true,
    'Predicted_Value': test_pred
})
results_df.to_csv('2_comparison_results.csv', index=False, float_format='%.6f')

errors = y_true - test_pred  # signed residuals, used for the dashboard plots
mse = mean_squared_error(T_test, test_pred)
metrics = {
    "MAE": mean_absolute_error(T_test, test_pred),
    "MSE": mse,
    "RMSE": np.sqrt(mse),
    "R²": r2_score(T_test, test_pred),
}
# --- Build a 3x2 dashboard figure summarizing model performance ---
fig = plt.figure(figsize=(18, 12))
plt.rc('font', size=10)

# Panel 1 (top row, full width): true vs. predicted values by sample index.
ax1 = plt.subplot2grid((3, 2), (0, 0), colspan=2)
sample_indices = np.arange(len(T_test))
ax1.plot(sample_indices, T_test, 'b-', alpha=0.6, label='真实值')
ax1.plot(sample_indices, test_pred, 'r--', alpha=0.8, label='预测值')
ax1.set_title('预测值与真实值趋势对比', fontsize=12, pad=10)
ax1.set_xlabel('样本序号', fontsize=10)
ax1.set_ylabel('目标值', fontsize=10)
ax1.legend(loc='upper right', frameon=False)
ax1.grid(True, linestyle='--', alpha=0.6)

# Panel 2 (middle left): predicted vs. true scatter colored by signed error;
# vmin/vmax are symmetric so zero error maps to the colormap center.
ax2 = plt.subplot2grid((3, 2), (1, 0))
sc = ax2.scatter(T_test, test_pred, c=errors, cmap='coolwarm',
                 alpha=0.7, edgecolors='none', vmin=-np.abs(errors).max(),
                 vmax=np.abs(errors).max())
plt.colorbar(sc, ax=ax2, label='预测误差')
# Diagonal y = x reference line spanning the union of both axis limits.
lims = [np.min([ax2.get_xlim(), ax2.get_ylim()]),
        np.max([ax2.get_xlim(), ax2.get_ylim()])]
ax2.plot(lims, lims, 'k--', alpha=0.5, lw=2)
# Orange fitted regression line over the scatter; points suppressed.
sns.regplot(x=T_test.flatten(), y=test_pred, ax=ax2,
            scatter=False, color='orange', line_kws={'lw':1.5})
ax2.set_title(f'预测值 vs 真实值 (R²={metrics["R²"]:.3f})', fontsize=12)
ax2.set_xlabel('真实值', fontsize=10)
ax2.set_ylabel('预测值', fontsize=10)

# Panel 3 (middle right): histogram of prediction errors with KDE overlay.
ax3 = plt.subplot2grid((3, 2), (1, 1))
sns.histplot(errors, kde=True, ax=ax3, color='purple',
             bins=30, alpha=0.5, edgecolor='w')
ax3.axvline(0, color='gray', linestyle='--', alpha=0.8)  # zero-error marker
ax3.set_title('预测误差分布', fontsize=12)
ax3.set_xlabel('预测误差', fontsize=10)
ax3.set_ylabel('密度', fontsize=10)

# Panel 4 (bottom row, full width): metrics rendered as a table, axes hidden.
ax4 = plt.subplot2grid((3, 2), (2, 0), colspan=2)
cell_text = [[f"{v:.4f}" for v in metrics.values()]]
table = ax4.table(cellText=cell_text,
                  colLabels=list(metrics.keys()),
                  loc='center',
                  cellLoc='center',
                  bbox=[0.2, 0, 0.6, 1])
table.auto_set_font_size(False)
table.set_fontsize(12)
ax4.axis('off')

# Save the dashboard to disk and close the figure to free memory.
plt.tight_layout()
plt.subplots_adjust(hspace=0.4, wspace=0.3)
plt.savefig('3_analysis_dashboard.png', dpi=150, bbox_inches='tight')
plt.close()