Review of key points:
- The history and underlying principles of TensorBoard
- Common TensorBoard operations (see the sketch after this list)
- TensorBoard in practice on CIFAR-10: MLP and CNN models
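
As a quick refresher on the common operations, here is a minimal sketch; the tags, values, and the "runs/demo" directory are purely illustrative:

import torch
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("runs/demo")                      # one event-file directory per run
writer.add_scalar("loss/train", 0.42, 1)                 # a single point on a curve
writer.add_histogram("fc/weight", torch.randn(100), 1)   # a weight distribution
writer.add_image("input", torch.rand(3, 32, 32), 1)      # one CHW image
writer.close()

The full CIFAR-10 example below fine-tunes a pretrained ResNet-18 and logs scalars, images, text, and the model graph along the way.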
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import os
import datetime
# 1. Device configuration and log directory
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Timestamped subdirectory, so every run shows up as a separate entry in TensorBoard
log_dir = os.path.join("runs", "cifar10_resnet18", datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
writer = SummaryWriter(log_dir)
# 2. Data preprocessing (matched to the pretrained model)
train_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(0.2, 0.2, 0.2, 0.1),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])  # ImageNet statistics
])
test_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
# 3. Dataset loading
train_dataset = datasets.CIFAR10(
    root='./data', train=True, download=True, transform=train_transform)
test_dataset = datasets.CIFAR10(
    root='./data', train=False, download=True, transform=test_transform)
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
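# Note: with num_workers > 0 on Windows (spawn start method), module-level code
# such as the dataset and SummaryWriter setup re-runs in each worker process;
# creating them inside main() instead avoids stray timestamped log directories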
# 4. Model definition (using a pretrained backbone)
def create_finetune_model(num_classes=10, freeze_backbone=False):
    model = models.resnet18(weights='IMAGENET1K_V1')
    # Optionally freeze the convolutional backbone
    if freeze_backbone:
        for param in model.parameters():
            param.requires_grad = False
    # Replace the classification head (its new parameters are trainable by default)
    model.fc = nn.Sequential(
        nn.Dropout(0.5),
        nn.Linear(model.fc.in_features, num_classes)
    )
    return model.to(device)
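# Optional sanity check (sketch): after freezing, only the new head should be
# trainable, which can be verified with e.g.
#   trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)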
# 5. Training function (with TensorBoard monitoring)
def train_model(model, train_loader, test_loader, criterion, optimizer, scheduler, epochs=20, start_epoch=0):
    # start_epoch offsets the logged steps, so a second training phase extends
    # the curves instead of overwriting the steps logged by the first phase
    global_step = start_epoch * len(train_loader)
    best_acc = 0.0
    # Log the model graph and a few sample images
    sample_images, _ = next(iter(train_loader))
    writer.add_graph(model, sample_images.to(device))
    writer.add_images('Sample Training Images', sample_images[:8])
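    # Note: these tensors are still ImageNet-normalized, so the logged preview
    # looks color-shifted; un-normalizing before add_images would fix the display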
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0
        for batch_idx, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # Accumulate batch metrics
            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
            # Log to TensorBoard every 50 steps
            if global_step % 50 == 0:
                writer.add_scalar('Training/Loss', loss.item(), global_step)
                writer.add_scalar('Training/Accuracy', 100.*correct/total, global_step)
                writer.add_scalar('Training/Learning Rate', optimizer.param_groups[0]['lr'], global_step)
            global_step += 1
        # Validation phase
        model.eval()
        test_loss = 0.0
        test_correct = 0
        test_total = 0
        wrong_images = []
        wrong_labels = []
        wrong_preds = []
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                test_loss += loss.item()
                _, predicted = outputs.max(1)
                test_total += labels.size(0)
                test_correct += predicted.eq(labels).sum().item()
                # Collect misclassified samples (at most 8 per batch)
                wrong_mask = (predicted != labels)
                if wrong_mask.any():
                    wrong_images.append(inputs[wrong_mask][:8].cpu())
                    wrong_labels.extend(labels[wrong_mask][:8].cpu().numpy())
                    wrong_preds.extend(predicted[wrong_mask][:8].cpu().numpy())
        # Log epoch-level metrics (offset by start_epoch so phases don't collide)
        train_acc = 100. * correct / total
        test_acc = 100. * test_correct / test_total
        log_epoch = start_epoch + epoch
        writer.add_scalars('Accuracy', {'train': train_acc, 'val': test_acc}, log_epoch)
        writer.add_scalars('Loss', {'train': running_loss/len(train_loader), 'val': test_loss/len(test_loader)}, log_epoch)
        # Log misclassified samples; slice the captions to match the 8-image grid
        if wrong_images:
            wrong_images = torch.cat(wrong_images)[:8]
            writer.add_images('Wrong Predictions', wrong_images, log_epoch)
            writer.add_text('Wrong Labels',
                            '\n'.join([f'True: {test_dataset.classes[l]} | Pred: {test_dataset.classes[p]}'
                                       for l, p in zip(wrong_labels[:8], wrong_preds[:8])]), log_epoch)
        # Update the learning rate and save the best checkpoint
        if scheduler:
            scheduler.step(test_acc)
        if test_acc > best_acc:
            best_acc = test_acc
            torch.save(model.state_dict(), f'best_model_{test_acc:.2f}.pth')
        print(f'Epoch [{epoch+1}/{epochs}] '
              f'Train Acc: {train_acc:.2f}% | Val Acc: {test_acc:.2f}% | '
              f'LR: {optimizer.param_groups[0]["lr"]:.2e}')
    # The writer stays open here; main() closes it after both training phases
    return best_acc
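# Optional extension (sketch): weight histograms are another common TensorBoard
# operation; inside the epoch loop above one could add, e.g.
#   for name, p in model.named_parameters():
#       if p.requires_grad:
#           writer.add_histogram(name, p, epoch)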
# 6. Main entry point
def main():
    # Initialize the model with the backbone frozen for the first phase
    model = create_finetune_model(freeze_backbone=True)
    # Only the new classification head is trainable at this point
    optimizer = optim.Adam(model.fc.parameters(), lr=1e-3, weight_decay=1e-4)
    criterion = nn.CrossEntropyLoss()
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=2, factor=0.5)
    # Phase 1: train the classification head only
    print("Phase 1: training the classification head (backbone frozen)")
    train_model(model, train_loader, test_loader, criterion, optimizer, scheduler, epochs=5)
    # Phase 2: unfreeze everything and fine-tune at a lower learning rate
    print("\nPhase 2: fine-tuning with all parameters unfrozen")
    for param in model.parameters():
        param.requires_grad = True
    optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=3, factor=0.5)
    train_model(model, train_loader, test_loader, criterion, optimizer, scheduler, epochs=15, start_epoch=5)
    writer.close()
if __name__ == "__main__":
    main()
    print(f"Launch TensorBoard to view the results:\ntensorboard --logdir={log_dir}")