Original title: ChatGPT Chinese Fine-Tuning Code Explained, with Results
Introduction:
In recent years, artificial intelligence has advanced rapidly, and breakthroughs in natural language processing have been especially notable. As one of the key applications of NLP, dialogue generation is widely used in customer service, chatbots, and similar scenarios, and the GPT model (Generative Pre-trained Transformer) is currently one of the most successful and popular dialogue-generation models.
This article walks through how to fine-tune a Chinese GPT model using the chatglm library, with concrete code examples.
First, prepare the following environment and data:
1. A Python 3.x environment
2. GPU acceleration (optional)
3. Pre-trained ChatGPT weight files
4. A fine-tuning dataset of paired input/output samples
Next, let's start writing the code:
Step 1: Install the chatglm library
```
!pip install git+
```
Step 2: Import the required libraries and modules
```python
import torch
from chatglm import GptChineseTokenizer, Gpt2ForMaskedLMOriLM
from transformers.optimization import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
```
Step 3: Configure training parameters and load the pre-trained model and tokenizer
```python
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = GptChineseTokenizer.from_pretrained('uer/gpt2-chinese-cluecorpussmall')
model = Gpt2ForMaskedLMOriLM.from_pretrained('uer/gpt2-chinese-cluecorpussmall').to(device)
```
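The GptChineseTokenizer and Gpt2ForMaskedLMOriLM classes above are specific to the chatglm package from Step 1. If that package is unavailable, the same uer/gpt2-chinese-cluecorpussmall checkpoint can also be loaded directly with Hugging Face transformers; a minimal sketch, assuming transformers is installed (BertTokenizer and GPT2LMHeadModel are the classes listed on that model card, not part of the original post):

```python
# Alternative loading path (assumption): use Hugging Face transformers directly
# instead of the chatglm-specific classes shown above.
import torch
from transformers import BertTokenizer, GPT2LMHeadModel

device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = BertTokenizer.from_pretrained('uer/gpt2-chinese-cluecorpussmall')
model = GPT2LMHeadModel.from_pretrained('uer/gpt2-chinese-cluecorpussmall').to(device)
```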
Step 4: Define the fine-tuning dataset class
```python
class ConversationDataset(torch.utils.data.Dataset):
    def __init__(self, conversations, tokenizer, max_len=1024):
        self.conversations = conversations
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.conversations)

    def __getitem__(self, index):
        conversation = self.conversations[index]
        # Concatenate all messages of the conversation into one token sequence
        token_ids = []
        for message in conversation['messages']:
            token_ids += self.tokenizer.encode(message['content'], add_special_tokens=False)
        # Truncate, reserving room for the [CLS] and [SEP] special tokens
        token_ids = token_ids[: self.max_len - 2]
        input_ids = [self.tokenizer.cls_token_id] + token_ids + [self.tokenizer.sep_token_id]
        attention_mask = [1] * len(input_ids)
        # Pad every sample to max_len so that samples can be batched
        padding_length = self.max_len - len(input_ids)
        input_ids = input_ids + [self.tokenizer.pad_token_id] * padding_length
        attention_mask = attention_mask + [0] * padding_length
        return {
            'input_ids': torch.LongTensor(input_ids),
            'attention_mask': torch.LongTensor(attention_mask),
        }
```
Step 5: Load and prepare the fine-tuning dataset
```python
batch_size = 4  # also listed with the other training parameters in Step 7

train_dataset = ConversationDataset(train_samples, tokenizer, max_len=512)
val_dataset = ConversationDataset(val_samples, tokenizer, max_len=512)
test_dataset = ConversationDataset(test_samples, tokenizer, max_len=512)

train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=batch_size)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size)
```
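To confirm that batching works as expected, it can help to inspect one batch before training (a quick sanity check, not part of the original post):

```python
# Sanity check: each batch should contain padded tensors of shape
# (batch_size, max_len), i.e. (4, 512) with the settings used here.
batch = next(iter(train_data_loader))
print(batch['input_ids'].shape)
print(batch['attention_mask'].shape)
```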
Step 6: Define the fine-tuning functions
```python
def train(model, data_loader):
    model.train()
    total_loss = 0
    for bidx, inputs in enumerate(data_loader):
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        # For causal language-model fine-tuning, the labels are the input ids themselves
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=input_ids)
        loss = outputs[0]
        optimizer.zero_grad()
        loss.backward()
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(data_loader)


def evaluate(model, data_loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for bidx, inputs in enumerate(data_loader):
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=input_ids)
            total_loss += outputs[0].item()
    return total_loss / len(data_loader)
```
Step 7: Start fine-tuning the model
Define the training parameters:
```python
learning_rate = 5e-5
num_epochs = 3
batch_size = 4
max_grad_norm = 1.0

optimizer = AdamW(model.parameters(), lr=learning_rate)
# Linear warmup followed by linear decay over all training steps
total_steps = len(train_data_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=100,
                                            num_training_steps=total_steps)
```
Start training:
```python
best_loss = float('inf')
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    train_loss = train(model, train_data_loader)
    if val_dataset is not None:
        val_loss = evaluate(model, val_data_loader)
    scheduler.step()
    if val_dataset is not None and val_loss < best_loss:
        best_loss = val_loss
        # Keep the checkpoint that does best on the validation set
        torch.save(model.state_dict(), 'best_model.pt')
```
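Once training finishes, you can check the result by letting the fine-tuned model generate a reply. A minimal sketch using the transformers text-generation pipeline, assuming the model and tokenizer were loaded via transformers as in the alternative shown in Step 3 (the prompt text is just an example):

```python
# Hedged example: generate text with the fine-tuned model via the transformers
# pipeline API (assumes a transformers model/tokenizer, see the Step 3 sketch).
from transformers import TextGenerationPipeline

generator = TextGenerationPipeline(model=model, tokenizer=tokenizer,
                                   device=0 if device == 'cuda' else -1)
print(generator('你好,请问今天天气怎么样?', max_length=50, do_sample=True, top_p=0.9))
```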