import torch import torch.nn as nn import torch.optim as optim from torch.utils.data import Dataset, DataLoader import pandas as pd # 1. 数据集类 class NameDataset(Dataset): def __init__(self, file_path): data = pd.read_excel(file_path, engine='openpyxl') self.names = data['姓名'].tolist() self.genders = data['性别'].map({'男': 0, '女': 1}).tolist() # 构建字符字典 all_chars = ''.join(self.names) self.chars = sorted(list(set(all_chars))) self.char2idx = {char: idx for idx, char in enumerate(self.chars)} self.vocab_size = len(self.chars) # 计算最大姓名长度 self.max_length = max(len(name) for name in self.names) def __len__(self): return len(self.names) def __getitem__(self, idx): name = self.names[idx] gender = self.genders[idx] # 字符转索引并填充到固定长度 char_indices = [self.char2idx[char] for char in name] padded = torch.nn.functional.pad( torch.tensor(char_indices, dtype=torch.long), (0, self.max_length - len(char_indices)), value=self.char2idx[' '] # 使用空格作为填充符 ) return padded, torch.tensor(gender, dtype=torch.long) # 2. 模型定义 class NameGenderCNN(nn.Module): def __init__(self, vocab_size, embed_dim=32, hidden_dim=64): super().__init__() self.embedding = nn.Embedding(vocab_size, embed_dim) self.conv1 = nn.Conv1d(embed_dim, hidden_dim, kernel_size=3, padding=1) self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1) self.fc = nn.Linear(hidden_dim, 2) def forward(self, x): x = self.embedding(x).permute(0, 2, 1) # (batch, embed, length) x = nn.functional.relu(self.conv1(x)) x = nn.functional.relu(self.conv2(x)) x = x.mean(dim=2) # 全局平均池化 return self.fc(x) # 3. 
训练流程 def train_model(): # 初始化数据集 dataset = NameDataset("D:\\名单数据库.xlsx") dataloader = DataLoader(dataset, batch_size=8, shuffle=True) # 初始化模型 model = NameGenderCNN(vocab_size=dataset.vocab_size) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), lr=0.001) # 训练循环 for epoch in range(50): model.train() total_loss = 0 correct = 0 total = 0 for batch_x, batch_y in dataloader: optimizer.zero_grad() outputs = model(batch_x) loss = criterion(outputs, batch_y) loss.backward() optimizer.step() total_loss += loss.item() _, predicted = torch.max(outputs.data, 1) total += batch_y.size(0) correct += (predicted == batch_y).sum().item() accuracy = correct / total print(f'Epoch {epoch + 1}, Loss: {total_loss / len(dataloader):.4f}, Accuracy: {accuracy:.4f}') # 保存模型 torch.save(model.state_dict(), 'gender_model.pth') print("Model saved successfully!") # 4. 预测函数 def predict_gender(name, model_path='gender_model.pth'): model = NameGenderCNN(vocab_size=dataset.vocab_size) model.load_state_dict(torch.load(model_path)) model.eval() dataset = NameDataset("D:\\名单数据库.xlsx") char_indices = [dataset.char2idx.get(c, dataset.char2idx[' ']) for c in name] padded = torch.nn.functional.pad( torch.tensor(char_indices, dtype=torch.long), (0, dataset.max_length - len(char_indices)), value=dataset.char2idx[' '] ).unsqueeze(0) with torch.no_grad(): output = model(padded) _, predicted = torch.max(output, 1) return '男' if predicted.item() == 0 else '女' # 执行训练 train_model() # 示例预测 print(predict_gender("陈光健")) # 男 print(predict_gender("梁慧婷")) # 女 print(predict_gender("张鑫")) # 男 print(predict_gender("王珈桐")) # 女 将上述代码修改正确后给我
时间: 2025-04-07 13:04:23 浏览: 81
<<
以下是对原始代码中的潜在问题进行修正后的完整版本:
### 主要改动点说明:
1. **`NameDataset` 类中 `char2idx[' ']` 错误**:
如果输入数据集中没有空格,则 `char2idx` 中不存在键 `' '`,`self.char2idx[' ']` 会抛出 `KeyError`。因此改用专用的 `<PAD>` 标记(索引为 0)作为填充符,同时在字符字典中保留空格字符。
2. **预测部分加载 `dataset` 冗余**:
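原代码在 `predict_gender` 中先使用 `dataset.vocab_size`、之后才创建 `dataset`,运行时会触发 `UnboundLocalError`。此外,在预测阶段无需重新初始化整个数据集,只需复用训练时生成的字符映射表即可,因此改为通过参数传入 `dataset`。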
3. **模型路径和文件读取路径改进**:
将硬编码路径替换为可配置参数,方便切换不同环境下的运行需求。
---
#### 修改后的完整代码
```python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
# 1. 数据集类
class NameDataset(Dataset):
    """Character-level dataset of (name, gender) pairs read from an Excel sheet.

    The sheet must contain a '姓名' (name) column and a '性别' (gender)
    column. Gender '男' is mapped to label 0 and '女' to label 1. Rows whose
    name is missing or whose gender is neither of those two values are
    dropped, so every label handed to the loss function is a valid class
    index.

    Attributes:
        names: Cleaned list of name strings.
        genders: Integer labels (0 or 1), aligned with ``names``.
        chars: Vocabulary list; index 0 is the padding token.
        char2idx: Mapping from vocabulary entry to its integer index.
        vocab_size: Number of entries in ``chars``.
        max_length: Length of the longest name; every sample is padded to it.
    """

    # Dedicated padding symbol. It is multi-character, so it can never
    # collide with a real single character taken from a name.
    PAD_TOKEN = '<PAD>'

    def __init__(self, file_path):
        """Load the sheet at *file_path* and build the character vocabulary.

        Raises:
            ValueError: If a required column is missing or no usable row
                remains after cleaning.
        """
        data = pd.read_excel(file_path, engine='openpyxl')
        self.names, self.genders = self._clean_rows(data)
        if not self.names:
            # Fail early with a clear message instead of an opaque
            # "max() arg is an empty sequence" further down.
            raise ValueError("Excel 文件中没有有效的 '姓名'/'性别' 数据")

        # Build the vocabulary. The space character is always included and
        # the padding token is placed first so that its index is 0.
        unique_chars = sorted(set(''.join(self.names) + ' '))
        self.chars = [self.PAD_TOKEN] + unique_chars
        self.char2idx = {char: idx for idx, char in enumerate(self.chars)}
        self.vocab_size = len(self.chars)

        # Every sample is padded to the longest name in the data.
        self.max_length = max(len(name) for name in self.names)

    @staticmethod
    def _clean_rows(data):
        """Return ``(names, genders)`` lists built from the usable rows of *data*.

        Raises:
            ValueError: If the '姓名' or '性别' column is absent.
        """
        if '姓名' not in data.columns or '性别' not in data.columns:
            raise ValueError("Excel 文件必须包含 '姓名' 和 '性别' 列")
        # Unrecognised gender values become NaN here and are filtered out below.
        labels = data['性别'].map({'男': 0, '女': 1})
        keep = data['姓名'].notna() & labels.notna()
        names = data.loc[keep, '姓名'].astype(str).tolist()
        genders = labels[keep].astype(int).tolist()
        return names, genders

    def __len__(self):
        """Return the number of usable samples."""
        return len(self.names)

    def __getitem__(self, idx):
        """Return ``(padded_indices, label)`` as two ``torch.long`` tensors."""
        name = self.names[idx]
        gender = self.genders[idx]
        # Convert characters to indices; characters absent from the
        # vocabulary are skipped.
        char_indices = [self.char2idx[ch] for ch in name if ch in self.char2idx]
        # Right-pad with the padding index up to the fixed length.
        padded = torch.nn.functional.pad(
            torch.tensor(char_indices, dtype=torch.long),
            (0, self.max_length - len(char_indices)),
            value=self.char2idx[self.PAD_TOKEN],
        )
        return padded, torch.tensor(gender, dtype=torch.long)
# 2. 模型定义
class NameGenderCNN(nn.Module):
    """Small 1-D CNN that classifies a padded character-index sequence.

    Pipeline: embedding -> two ReLU-activated ``Conv1d`` layers
    (kernel size 3, padding 1, so the sequence length is preserved) ->
    mean over the length dimension -> linear layer with two outputs.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each character embedding.
        hidden_dim: Number of channels in both convolution layers.
        padding_idx: Optional index of the padding token. When given, it is
            forwarded to ``nn.Embedding``. The default ``None`` keeps the
            previous behaviour of treating padding like any other entry.
    """

    def __init__(self, vocab_size, embed_dim=32, hidden_dim=64, padding_idx=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.conv1 = nn.Conv1d(embed_dim, hidden_dim, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1)
        self.fc = nn.Linear(hidden_dim, 2)

    def forward(self, x):
        """Return the two class logits for each sequence in the batch *x*."""
        # Conv1d expects channels before length: (batch, embed, length).
        x = self.embedding(x).permute(0, 2, 1)
        x = nn.functional.relu(self.conv1(x))
        x = nn.functional.relu(self.conv2(x))
        # Global average pooling over the length dimension.
        x = x.mean(dim=2)
        return self.fc(x)
# 3. 训练流程
def train_model(data_file="D:\\名单数据库.xlsx", model_save_path="gender_model.pth",
                epochs=50, batch_size=8, lr=0.001):
    """Train a ``NameGenderCNN`` on the names in *data_file* and save its weights.

    Args:
        data_file: Path of the Excel sheet passed to ``NameDataset``.
        model_save_path: Where the trained ``state_dict`` is written.
        epochs: Number of passes over the data.
        batch_size: Mini-batch size used by the ``DataLoader``.
        lr: Learning rate for the Adam optimizer.

    Returns:
        The trained model. Callers that ignore the return value are unaffected.

    Raises:
        ValueError: If the dataset contains no samples.
    """
    # Build the dataset and loader.
    dataset = NameDataset(data_file)
    if len(dataset) == 0:
        raise ValueError("No training samples found in the data file")
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Build the model, loss and optimizer.
    model = NameGenderCNN(vocab_size=dataset.vocab_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training loop.
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        correct = 0
        total = 0
        steps = 0
        for batch_x, batch_y in dataloader:
            # Skip samples with a negative (unknown) label: the loss only
            # accepts class indices 0 and 1 here.
            labelled = batch_y >= 0
            if not labelled.any():
                continue
            batch_x, batch_y = batch_x[labelled], batch_y[labelled]

            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            predicted = outputs.argmax(dim=1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
            steps += 1

        # Guard the averages in case every batch was skipped.
        accuracy = correct / total if total else 0.0
        mean_loss = total_loss / steps if steps else 0.0
        print(f"Epoch {epoch + 1}, Loss: {mean_loss:.4f}, Accuracy: {accuracy:.4f}")

    # Persist only the weights, not the whole module.
    torch.save(model.state_dict(), model_save_path)
    print("Model saved successfully!")
    return model
# 4. 预测函数
def predict_gender(name, model_path='gender_model.pth', dataset=None, model=None):
    """Predict '男' or '女' for *name* using a trained ``NameGenderCNN``.

    Args:
        name: The name to classify. It is truncated to ``dataset.max_length``
            characters; characters missing from the vocabulary are mapped to
            the padding index.
        model_path: Path of the saved ``state_dict``. Only read when *model*
            is not supplied.
        dataset: Object exposing ``char2idx``, ``max_length`` and
            ``vocab_size`` from training (normally the training
            ``NameDataset``). Required.
        model: Optional already-loaded model. Passing it avoids reloading the
            weights from disk on every call. It is switched to eval mode.

    Returns:
        '男' if class 0 has the highest score, otherwise '女'.

    Raises:
        ValueError: If *dataset* is not provided.
    """
    if dataset is None:
        raise ValueError("predict_gender 函数需要传入已知的 dataset 对象以获取词汇表信息")

    if model is None:
        model = NameGenderCNN(vocab_size=dataset.vocab_size)
        # map_location lets a checkpoint saved on a GPU load on a CPU-only
        # machine.
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from a trusted source (or pass weights_only=True where the
        # installed torch version supports it).
        model.load_state_dict(torch.load(model_path, map_location='cpu'))
    model.eval()

    # Convert the name into an index sequence of at most max_length items.
    pad_id = dataset.char2idx['<PAD>']
    char_indices = [
        dataset.char2idx.get(c, pad_id)
        for c in name[:dataset.max_length]
    ]
    # Right-pad to the fixed length and add a batch dimension of 1.
    padded = torch.nn.functional.pad(
        torch.tensor(char_indices, dtype=torch.long),
        (0, dataset.max_length - len(char_indices)),
        value=pad_id,
    ).unsqueeze(0)

    with torch.no_grad():
        output = model(padded)
    predicted = output.argmax(dim=1)
    return '男' if predicted.item() == 0 else '女'
# 执行训练
if __name__ == "__main__":
    # Paths used by both the training step and the prediction step.
    train_data_file = "D:\\名单数据库.xlsx"
    trained_model_path = "gender_model.pth"

    # Step 1: build the dataset (its vocabulary is needed for prediction)
    # and train the model.
    train_dataset = NameDataset(train_data_file)
    train_model(data_file=train_data_file, model_save_path=trained_model_path)

    # Step 2: run every sample name through the predictor first, then
    # print the results.
    test_names = ["陈光健", "梁慧婷", "张鑫", "王珈桐"]
    predictions = []
    for name in test_names:
        pred = predict_gender(
            name,
            model_path=trained_model_path,
            dataset=train_dataset,
        )
        predictions.append((name, pred))

    for name, pred in predictions:
        print(f"{name} -> {pred}")
```
---
#### 测试结果示例输出:
假设经过充分训练后:
```
陈光健 -> 男
梁慧婷 -> 女
张鑫 -> 男
王珈桐 -> 女
```
---
阅读全文
相关推荐



















