All of the code below is available at https://github.com/anda522/gpt2
A pretrained base model only performs text completion and does not follow human instructions. Instruction fine-tuning trains the model to produce the output we want given a textual instruction.
Data Download
For instruction fine-tuning we need a supervised dataset in which each entry consists of an instruction, an input, and the expected output.
We download the dataset directly with the following code:
import json
import os
import urllib.request

def download_and_load_file(file_path, url):
    if not os.path.exists(file_path):
        # Make sure the target directory exists before writing
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    return data

file_path = "./data/instruction_finetune/instruction-data.json"
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)

data = download_and_load_file(file_path, url)
print("Number of entries:", len(data))
For this simple instruction dataset, the data is a JSON file in which each entry is a dictionary of roughly the following form:
{'instruction': 'Identify the correct spelling of the following word.', 'input': 'Ocassion', 'output': "The correct spelling is 'Occasion.'"}
We format each entry with an Alpaca-style prompt (the original prompt template used for instruction fine-tuning):
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Identify the correct spelling of the following word.
### Input:
Ocassion
### Response:
The correct spelling is 'Occasion.'
The formatting code is as follows:
def format_input(entry):
    instruction_text = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )
    input_text = f"\n\n### Input:\n{entry['input']}" if entry["input"] else ""
    return instruction_text + input_text
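As a quick check, we can apply format_input to the sample entry shown above and append the response field the same way the training code will later (the sample dictionary here is just the earlier example re-typed for illustration):

sample = {
    'instruction': 'Identify the correct spelling of the following word.',
    'input': 'Ocassion',
    'output': "The correct spelling is 'Occasion.'"
}
model_input = format_input(sample)
desired_response = f"\n\n### Response:\n{sample['output']}"
print(model_input + desired_response)

This should print exactly the Alpaca-style prompt shown above.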
Next, we split the dataset into training, validation, and test sets:
train_portion = int(len(data) * 0.85) # 85% for training
test_portion = int(len(data) * 0.1) # 10% for testing
val_portion = len(data) - train_portion - test_portion # Remaining 5% for validation
train_data = data[:train_portion]
test_data = data[train_portion:train_portion + test_portion]
val_data = data[train_portion + test_portion:]
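A quick print confirms that the three portions cover the whole dataset:

print("Training set length:", len(train_data))
print("Validation set length:", len(val_data))
print("Test set length:", len(test_data))
assert len(train_data) + len(val_data) + len(test_data) == len(data)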
Data Processing
We first build the Dataset:
import torch
from torch.utils.data import Dataset, DataLoader

class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        # Pre-tokenize texts
        self.encoded_texts = []
        for entry in data:
            instruction_plus_input = format_input(entry)
            response_text = f"\n\n### Response:\n{entry['output']}"
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(
                tokenizer.encode(full_text)
            )

    def __getitem__(self, index):
        return self.encoded_texts[index]

    def __len__(self):
        return len(self.data)
When constructing the dataloader, we need custom logic for combining the Dataset items into batches, so we define a custom_collate_fn:
from functools import partial

def custom_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    # Find the longest sequence in the batch
    batch_max_length = max(len(item)+1 for item in batch)

    # Pad and prepare inputs and targets
    inputs_lst, targets_lst = [], []

    for item in batch:
        new_item = item.copy()
        # Add an <|endoftext|> token
        new_item += [pad_token_id]
        # Pad sequences to max_length
        padded = (
            new_item + [pad_token_id] *
            (batch_max_length - len(new_item))
        )
        inputs = torch.tensor(padded[:-1])  # Truncate the last token for inputs
        targets = torch.tensor(padded[1:])  # Shift +1 to the right for targets

        # Replace all but the first padding token in targets by ignore_index
        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index

        # Optionally truncate to the maximum sequence length
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Convert the lists of inputs and targets to tensors and move them to the target device
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

customized_collate_fn = partial(
    custom_collate_fn,
    device=device,
    allowed_max_length=1024
)
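To see what the collate function produces, here is a minimal sketch on a toy batch of three hypothetical token-ID lists (the values are arbitrary; 50256 is GPT-2's <|endoftext|> ID):

batch = [
    [0, 1, 2, 3, 4],
    [5, 6],
    [7, 8, 9],
]
inputs, targets = custom_collate_fn(batch)
print(inputs)
# tensor([[    0,     1,     2,     3,     4],
#         [    5,     6, 50256, 50256, 50256],
#         [    7,     8,     9, 50256, 50256]])
print(targets)
# tensor([[    1,     2,     3,     4, 50256],
#         [    6, 50256,  -100,  -100,  -100],
#         [    8,     9, 50256,  -100,  -100]])

Only the first <|endoftext|> after each sequence keeps its label; the remaining padding positions are set to -100, which torch.nn.functional.cross_entropy ignores by default, so padding does not contribute to the loss.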
Constructing the dataloaders:
import logging
import tiktoken

logging.basicConfig(level=logging.INFO)  # make logging.info messages visible

# GPT-2 BPE tokenizer; its <|endoftext|> ID (50256) matches pad_token_id above
tokenizer = tiktoken.get_encoding("gpt2")
batch_size = 8   # reasonable default; tune to your hardware
num_workers = 0

train_dataset = InstructionDataset(train_data, tokenizer)
logging.info(f"train_dataset: {len(train_dataset)}")
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers
)

val_dataset = InstructionDataset(val_data, tokenizer)
logging.info(f"val_dataset: {len(val_dataset)}")
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

test_dataset = InstructionDataset(test_data, tokenizer)
logging.info(f"test_dataset: {len(test_dataset)}")
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)
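As a sanity check, we can pull one batch from train_loader and inspect the tensor shapes (the second dimension varies with the longest sequence in the sampled batch):

inputs, targets = next(iter(train_loader))
print("inputs:", inputs.shape)    # (batch_size, longest sequence in this batch)
print("targets:", targets.shape)  # same shape as inputs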
Model Fine-Tuning
We fine-tune GPT-2 Medium. The weights can be downloaded from:
https://huggingface.co/openai-community/gpt2-medium/tree/main
The training procedure below is essentially the same as the earlier GPT-2 training code, so we present it directly:
from all_code import (
    GPTModel,
    load_weights,
    text_to_token_ids,
    token_ids_to_text,
    generate
)
from transformers import GPT2Model

weights_path = "weights/gpt2-medium"

BASE_CONFIG = {
    "vocab_size": 50257,   # Vocabulary size
    "ctx_len": 1024,       # Context length
    "drop_rate": 0.0,      # Dropout rate
    "qkv_bias": True       # Query-key-value bias
}

model_configs = {
    "gpt2-small":  {"emb_dim": 768,  "n_layers": 12, "n_heads": 12},
    "gpt2-medium": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large":  {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl":     {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

gpt_hf = GPT2Model.from_pretrained(weights_path)
gpt_hf.eval()

BASE_CONFIG.update(model_configs["gpt2-medium"])

model = GPTModel(BASE_CONFIG)
load_weights(model, gpt_hf)
model.eval()
model.to(device)
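Before fine-tuning, it is instructive to see how the pretrained model handles an instruction prompt. A rough sketch, reusing the generate helper imported above with the same sampling settings as generate_and_print_sample below; the base model typically just continues the text rather than following the instruction:

torch.manual_seed(123)
input_text = format_input(val_data[0])
token_ids = generate(
    model=model,
    idx=text_to_token_ids(input_text, tokenizer).to(device),
    max_new_tokens=35,
    context_size=BASE_CONFIG["ctx_len"],
    temperature=1.5, top_k=10
)
print(token_ids_to_text(token_ids, tokenizer))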
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    logits = logits.view(-1, logits.shape[-1])
    target_batch = target_batch.view(-1)
    # cross_entropy ignores targets equal to -100 by default, so the padding
    # positions masked in custom_collate_fn do not contribute to the loss
    loss = torch.nn.functional.cross_entropy(logits, target_batch)
    return loss

def calc_loss_loader(loader, model, device, num_batches=None):
    total_loss = 0.
    if len(loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(loader)
    else:
        num_batches = min(num_batches, len(loader))
    for i, (input_batch, target_batch) in enumerate(loader):
        if i >= num_batches:
            break
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
    return total_loss / num_batches
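Before launching training, a quick look at the starting losses over a few batches confirms the plumbing works:

with torch.no_grad():  # no gradients needed for evaluation
    train_loss = calc_loss_loader(train_loader, model, device, num_batches=5)
    val_loss = calc_loss_loader(val_loader, model, device, num_batches=5)
print("Initial training loss:", train_loss)
print("Initial validation loss:", val_loss)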
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    # Initialize lists to track losses and the number of tokens seen
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Put the model in training mode

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()  # Reset the gradients from the previous batch
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()        # Compute the loss gradients
            optimizer.step()       # Update the model parameters using the gradients
            tokens_seen += input_batch.numel()
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                logging.info(f"Epoch {epoch+1} (Step {global_step:06d}): "
                             f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Print a generated text sample after each epoch
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )

    return train_losses, val_losses, track_tokens_seen

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss

def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate(
            model=model, idx=encoded,
            max_new_tokens=50, context_size=context_size,
            temperature=1.5, top_k=10
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    logging.info(decoded_text.replace("\n", " "))  # Compact print format
    model.train()
After training completes, we save the model:
import time

start_time = time.time()

optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)
num_epochs = 10

train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context=format_input(val_data[0]), tokenizer=tokenizer
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
logging.info(f"Training completed in {execution_time_minutes:.2f} minutes.")

# Make sure the output directory exists before saving
os.makedirs("weights/instruction-finetune", exist_ok=True)
torch.save(model.state_dict(), "weights/instruction-finetune/instruction_model.pth")
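To use the fine-tuned model later, here is a minimal sketch of reloading the saved weights and answering a test-set instruction; stripping the prompt by character length and removing the "### Response:" marker is an assumption based on the prompt format defined above:

model = GPTModel(BASE_CONFIG)
model.load_state_dict(
    torch.load("weights/instruction-finetune/instruction_model.pth", map_location=device)
)
model.to(device)
model.eval()

entry = test_data[0]
input_text = format_input(entry)
token_ids = generate(
    model=model,
    idx=text_to_token_ids(input_text, tokenizer).to(device),
    max_new_tokens=256,
    context_size=BASE_CONFIG["ctx_len"],
    temperature=1.0, top_k=10
)
generated_text = token_ids_to_text(token_ids, tokenizer)
# The generated text echoes the prompt; keep only what follows it
response = generated_text[len(input_text):].replace("### Response:", "").strip()
print(response)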