1. Chat Template
'1. Instruction fine-tuning comes after pre-training (LoRA fine-tuning)'
# Pre-training turns the large model into a "domain expert"
# Instruction fine-tuning teaches the model how to express itself
Every large model uses a different instruction format,
so always do instruction fine-tuning with the chat format officially published for that model.
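As a quick illustration, you can render a dialog with the tokenizer's chat template and print it to see the official format. This is a minimal sketch; the special tokens shown in the comment are the Llama 3.1 format and will differ for other model families.
# inspect_template.py (illustrative)
from transformers import AutoTokenizer
model_path = r'D:\work\models\Meta-Llama-3.1-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_path)
dialog = [{"role": "user", "content": "Why is the sky blue?"}]
print(tokenizer.apply_chat_template(dialog, tokenize=False, add_generation_prompt=True))
# Expected to look roughly like:
# <|begin_of_text|><|start_header_id|>user<|end_header_id|>
#
# Why is the sky blue?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
#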
'2. Chat_Template source code'
# Chat_Template.py
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_path = r'D:\work\models\Meta-Llama-3.1-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda")
optimizer = torch.optim.AdamW(model.parameters())

dialog = [{"role": "system", "content": "You are a helpful assistant."},
          {"role": "user", "content": "Why is the sky blue?"},
          {"role": "assistant", "content": "It is caused by the scattering of light."}]
# return_dict=True returns a dict with input_ids and attention_mask
inputs = tokenizer.apply_chat_template(dialog, return_tensors="pt", return_dict=True)
inputs = {k: v.to("cuda") for k, v in inputs.items()}
# Set labels identical to input_ids: the loss covers the whole conversation
inputs["labels"] = inputs["input_ids"].clone()
output = model(**inputs)
# Get the model's loss
loss = output.loss
loss.backward()
optimizer.step()
optimizer.zero_grad()
# Save the model
model.save_pretrained("output_dir")
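After saving, a quick sanity check is to reload the model and generate a reply. This is a minimal sketch under the assumptions above: output_dir is the directory just written, and the tokenizer is reloaded from the original model_path because only the model was saved.
# check_output.py (illustrative)
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
model_path = r'D:\work\models\Meta-Llama-3.1-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained("output_dir", torch_dtype=torch.bfloat16).to("cuda")
dialog = [{"role": "user", "content": "Why is the sky blue?"}]
inputs = tokenizer.apply_chat_template(dialog, return_tensors="pt", return_dict=True,
                                       add_generation_prompt=True)
inputs = {k: v.to("cuda") for k, v in inputs.items()}
output_ids = model.generate(**inputs, max_new_tokens=100)
# Decode only the newly generated tokens
print(tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))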
2. Completions only
'1. Compute the loss only on the answer part'
With the plain chat-template approach above, the loss is computed over the whole conversation, prompt included.
Completions-only training improves on this by masking out everything except the assistant's answer, so only the response tokens contribute to the loss.
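To make the masking concrete, here is a toy sketch with hypothetical token ids, matching the 0/1 loss mask that the collate function below builds: positions whose target token belongs to the prompt get 0, positions whose target token belongs to the answer get 1.
# Toy example (hypothetical ids): prompt tokens p1 p2 p3, answer tokens a1 a2
# input_ids : [p1, p2, p3, a1, a2]
# labels    : [p2, p3, a1, a2]      (input_ids shifted left by one position)
# loss_mask : [ 0,  0,  1,  1]      (only predictions of answer tokens count)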
'2. Completions only source code'
# Completions_only.py
import functools
import json
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from torch.utils.data import DataLoader, Dataset
import torch


class SFTDataset(Dataset):
    """Reads a JSON-lines file and renders each example with the chat template."""
    def __init__(self, file_path, tokenizer):
        super().__init__()
        self.file_path = file_path
        self.examples = self._load_data(self.file_path)
        self.tokenizer = tokenizer

    @staticmethod
    def _load_data(file_path):
        items = []
        with open(file_path, "r", encoding="utf8") as f:
            for line in f:
                item = json.loads(line)
                items.append(item)
        return items

    def __getitem__(self, index):
        example = self.examples[index]
        dialog = [{"role": "system", "content": "You are a helpful assistant."},
                  {"role": "user", "content": example["query"]},
                  {"role": "assistant", "content": example["answer"]}]
        # Render the dialog as a plain string; tokenization happens in the collate_fn
        chat = self.tokenizer.apply_chat_template(dialog, tokenize=False)
        return chat

    def __len__(self):
        return len(self.examples)
model_path = r'D:\work\models\Meta-Llama-3.1-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization (QLoRA-style) so the 8B model fits in GPU memory
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=bnb_config)

# LoRA adapters on all attention and MLP projection layers
peft_config = LoraConfig(
    r=8,
    target_modules=["q_proj",
                    "v_proj",
                    "k_proj",
                    "o_proj",
                    "gate_proj",
                    "down_proj",
                    "up_proj"],
    task_type=TaskType.CAUSAL_LM,
    lora_alpha=16,
    lora_dropout=0.05
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
# The 4-bit quantized model is already placed on the GPU at load time,
# so an explicit model.to("cuda") is neither needed nor supported for bnb models.
optimizer = torch.optim.AdamW(model.parameters())
def sft_collate(batch, tokenizer, end_str, max_length):
    # Tokenize the rendered chat strings with padding and truncation
    inputs = tokenizer(batch, max_length=max_length, padding=True, truncation=True)
    input_ids = inputs["input_ids"]
    input_len = len(input_ids[0])
    # Token ids of the marker that precedes the assistant's answer.
    # add_special_tokens=False keeps the BOS token out, so these ids can be
    # matched as a sub-sequence of the input.
    end_ids = tokenizer(end_str, add_special_tokens=False)["input_ids"]
    end_id_len = len(end_ids)
    loss_mask = []
    for input_id in input_ids:
        # Search backwards for the last occurrence of the assistant marker
        for i in range(len(input_id) - end_id_len, -1, -1):
            if input_id[i:i + end_id_len] == end_ids:
                # Only tokens after the marker contribute to the loss
                mask = [1] * (input_len - 1)
                mask[:i + end_id_len - 1] = [0] * (i + end_id_len - 1)
                loss_mask.append(mask)
                break
            if i == 0:  # the whole answer part was truncated away
                loss_mask.append([0] * (input_len - 1))
    inputs = {k: torch.tensor(v) for k, v in inputs.items()}
    loss_mask = torch.tensor(loss_mask)
    return inputs, loss_mask
collate_fn = functools.partial(sft_collate,
                               tokenizer=tokenizer,
                               end_str="<|start_header_id|>assistant<|end_header_id|>\n\n",
                               max_length=50)
sft_dataset = SFTDataset("./data/sft_data.json", tokenizer)
data_loader = DataLoader(sft_dataset, batch_size=2, collate_fn=collate_fn, shuffle=True)
epoch = 10
for i in range(epoch):
    for inputs, loss_mask in data_loader:
        inputs = {k: v.to("cuda") for k, v in inputs.items()}
        loss_mask = loss_mask.to("cuda")
        # Shift by one position: the logits at step t predict the token at step t + 1
        logits = model(**inputs).logits[:, :-1, :]
        labels = inputs["input_ids"][:, 1:]
        logits = logits.reshape(-1, logits.size(-1))
        labels = labels.reshape(-1)
        loss_mask = loss_mask.reshape(-1)
        # Per-token loss, zeroed out everywhere except the answer tokens
        loss = torch.nn.functional.cross_entropy(logits, labels, reduction="none")
        loss = loss * loss_mask
        loss = torch.mean(loss)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        print(loss.item())
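The script above stops after printing the loss. As a short follow-up sketch (the directory name is an assumption), the trained LoRA adapter can be persisted and re-attached later; for a PEFT-wrapped model, save_pretrained writes only the adapter weights.
# After training (illustrative)
model.save_pretrained("sft_lora_adapter")   # saves only the LoRA adapter weights
# To reuse it later, load the quantized base model again and attach the adapter:
# from peft import PeftModel
# base = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=bnb_config)
# model = PeftModel.from_pretrained(base, "sft_lora_adapter")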
3. NEFTune
'1. Noisy Embeddings Finetuning'
Embeddings map discrete tokens to points in a continuous vector space.
Adding random noise to these embeddings during fine-tuning (NEFTune) can improve the model's performance.
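The core of NEFTune is only the scale of the noise: for embeddings of shape (batch, seq_len, hidden_dim), uniform noise is drawn from [-alpha / sqrt(seq_len * hidden_dim), +alpha / sqrt(seq_len * hidden_dim)] and added during training only. A minimal helper sketch (the function name is illustrative; the full script below inlines the same computation):
# neftune_sketch.py (illustrative)
import torch

def add_neftune_noise(embeds, alpha=10.0):
    # embeds: (batch, seq_len, hidden_dim); apply only while training
    dims = embeds.size(1) * embeds.size(2)
    mag_norm = alpha / (dims ** 0.5)
    return embeds + torch.zeros_like(embeds).uniform_(-mag_norm, mag_norm)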
'2. NEFTune source code'
import functools
import json
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from torch.utils.data import DataLoader, Dataset
import torch


class SFTDataset(Dataset):
    """Reads a JSON-lines file and renders each example with the chat template."""
    def __init__(self, file_path, tokenizer):
        super().__init__()
        self.file_path = file_path
        self.examples = self._load_data(self.file_path)
        self.tokenizer = tokenizer

    @staticmethod
    def _load_data(file_path):
        items = []
        with open(file_path, "r", encoding="utf8") as f:
            for line in f:
                item = json.loads(line)
                items.append(item)
        return items

    def __getitem__(self, index):
        example = self.examples[index]
        dialog = [{"role": "system", "content": "You are a helpful assistant."},
                  {"role": "user", "content": example["query"]},
                  {"role": "assistant", "content": example["answer"]}]
        chat = self.tokenizer.apply_chat_template(dialog, tokenize=False)
        return chat

    def __len__(self):
        return len(self.examples)
model_path = r'D:\work\models\Meta-Llama-3.1-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization (QLoRA-style) so the 8B model fits in GPU memory
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=bnb_config)

# LoRA adapters on all attention and MLP projection layers
peft_config = LoraConfig(
    r=8,
    target_modules=["q_proj",
                    "v_proj",
                    "k_proj",
                    "o_proj",
                    "gate_proj",
                    "down_proj",
                    "up_proj"],
    task_type=TaskType.CAUSAL_LM,
    lora_alpha=16,
    lora_dropout=0.05
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
# The 4-bit quantized model is already placed on the GPU at load time,
# so an explicit model.to("cuda") is neither needed nor supported for bnb models.
optimizer = torch.optim.AdamW(model.parameters())
def sft_collate(batch, tokenizer, end_str, max_length):
    # Tokenize the rendered chat strings with padding and truncation
    inputs = tokenizer(batch, max_length=max_length, padding=True, truncation=True)
    input_ids = inputs["input_ids"]
    input_len = len(input_ids[0])
    # Token ids of the marker that precedes the assistant's answer.
    # add_special_tokens=False keeps the BOS token out, so these ids can be
    # matched as a sub-sequence of the input.
    end_ids = tokenizer(end_str, add_special_tokens=False)["input_ids"]
    end_id_len = len(end_ids)
    loss_mask = []
    for input_id in input_ids:
        # Search backwards for the last occurrence of the assistant marker
        for i in range(len(input_id) - end_id_len, -1, -1):
            if input_id[i:i + end_id_len] == end_ids:
                # Only tokens after the marker contribute to the loss
                mask = [1] * (input_len - 1)
                mask[:i + end_id_len - 1] = [0] * (i + end_id_len - 1)
                loss_mask.append(mask)
                break
            if i == 0:  # the whole answer part was truncated away
                loss_mask.append([0] * (input_len - 1))
    inputs = {k: torch.tensor(v) for k, v in inputs.items()}
    loss_mask = torch.tensor(loss_mask)
    return inputs, loss_mask
collate_fn = functools.partial(sft_collate,
                               tokenizer=tokenizer,
                               end_str="<|start_header_id|>assistant<|end_header_id|>\n\n",
                               max_length=500)
sft_dataset = SFTDataset("./data/sft_data.json", tokenizer)
data_loader = DataLoader(sft_dataset, batch_size=2, collate_fn=collate_fn, shuffle=True)
epoch = 10
neftune_noise_alpha = 10
for i in range(epoch):
    for inputs, loss_mask in data_loader:
        # Look up the token embeddings manually so that noise can be added to them
        input_ids = inputs.pop("input_ids").to("cuda")
        input_embeddings = model.base_model.model.model.embed_tokens(input_ids)
        # NEFTune: uniform noise with magnitude alpha / sqrt(seq_len * hidden_dim)
        dims = torch.tensor(input_embeddings.size(1) * input_embeddings.size(2))
        mag_norm = neftune_noise_alpha / torch.sqrt(dims)
        input_embeddings = input_embeddings + torch.zeros_like(input_embeddings).uniform_(-mag_norm, mag_norm)
        inputs["inputs_embeds"] = input_embeddings
        inputs = {k: v.to("cuda") for k, v in inputs.items()}
        loss_mask = loss_mask.to("cuda")
        # Shift by one position: the logits at step t predict the token at step t + 1
        logits = model(**inputs).logits[:, :-1, :]
        labels = input_ids[:, 1:]
        logits = logits.reshape(-1, logits.size(-1))
        labels = labels.reshape(-1)
        loss_mask = loss_mask.reshape(-1)
        # Per-token loss, zeroed out everywhere except the answer tokens
        loss = torch.nn.functional.cross_entropy(logits, labels, reduction="none")
        loss = loss * loss_mask
        loss = torch.mean(loss)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        print(loss.item())
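Hand-rolling the noise injection, as above, is mainly to show how it works. The same behaviour is available out of the box through the neftune_noise_alpha argument of TRL's SFTConfig, which is exactly what the next section uses; a one-line sketch (other arguments omitted):
from trl import SFTConfig
sft_config = SFTConfig(output_dir="/tmp", neftune_noise_alpha=10)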
4. SFT_Trainer
# Supervised fine-tuning of a large model with the TRL SFTTrainer
import json
import torch
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM
from trl import SFTTrainer, SFTConfig, DataCollatorForCompletionOnlyLM
# Load the JSON-lines data into a prompt/completion dataset
items = []
with open("./data/sft_data.json", "r", encoding="utf8") as f:
    for line in f:
        item = json.loads(line)
        items.append({"prompt": item["query"], "completion": item["answer"]})
dataset = Dataset.from_list(items)
model_path = r'D:\work\models\Meta-Llama-3.1-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token
# 4-bit NF4 quantization (QLoRA-style)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=bnb_config,
                                             torch_dtype=torch.float16)
# LoRA adapters on all attention and MLP projection layers
peft_config = LoraConfig(
    r=8,
    target_modules=["q_proj",
                    "v_proj",
                    "k_proj",
                    "o_proj",
                    "gate_proj",
                    "down_proj",
                    "up_proj"],
    task_type=TaskType.CAUSAL_LM,
    lora_alpha=16,
    lora_dropout=0.05
)
model = get_peft_model(model, peft_config)
sft_config = SFTConfig(output_dir="/tmp",
                       neftune_noise_alpha=10,
                       per_device_train_batch_size=1,
                       max_seq_length=100,
                       num_train_epochs=10,
                       logging_steps=10,
                       logging_strategy="steps")
# Completions-only loss: everything before this marker is masked out of the labels
response_template = "<|start_header_id|>assistant<|end_header_id|>\n\n"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    args=sft_config,
    data_collator=collator
)
trainer.train()
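Once training finishes, a short usage sketch (directory name and prompt are illustrative): save the adapter via the trainer and run a quick generation check with the fine-tuned model still in memory.
# After training (illustrative)
trainer.save_model("sft_trainer_output")   # saves the LoRA adapter weights
dialog = [{"role": "user", "content": "Why is the sky blue?"}]
inputs = tokenizer.apply_chat_template(dialog, return_tensors="pt", return_dict=True,
                                       add_generation_prompt=True)
inputs = {k: v.to("cuda") for k, v in inputs.items()}
output_ids = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))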
Reference video: 【大模型微调看这个视频就够了 SFT NEFTune】 on Bilibili (哔哩哔哩).