在使用 ipex-llm 的 speculative 模式时报错:
RuntimeError: setStorage: sizes [1, 4, 396, 128], strides [201216, 50304, 128, 1], storage offset 0, and itemsize 4 requiring a storage size of 806400 are out of bounds for storage of size 804864
将模型转到 xpu 设备时报错:RuntimeError: PyTorch is not linked with support for xpu devices
import random
import time
import numpy as np
import torch
from transformers import AutoTokenizer#, AutoModelForCausalLM
from ipex_llm.transformers import AutoModelForCausalLM
# Load the 4-bit quantized model with ipex-llm speculative decoding enabled,
# kept on CPU. NOTE(review): moving this model to XPU fails with
# "PyTorch is not linked with support for xpu devices" — the installed torch
# build has no XPU backend; an XPU-enabled torch/ipex installation is required
# before .to('xpu') can work.
# (The original line ended with a bare string literal after the call —
# `).to('cpu') "无法转换到xpu上"` — which is a SyntaxError; it is now a comment.)
model = AutoModelForCausalLM.from_pretrained(
    '/home/models/swy/ouput/secQwen',
    load_in_4bit=True,
    device_map='cpu',
    speculative=True,
).to('cpu')  # cannot be moved to xpu with this torch build
tokenizer = AutoTokenizer.from_pretrained("/home/models/swy/ouput/secQwen", trust_remote_code=True)
# Single-example evaluation set. Each record holds the system prompt
# ("instruction"), the case description to classify ("input"), and the
# reference label ("output").
# NOTE(review): the label text looks self-contradictory ("不属于诈骗案件但
# 不属于信用卡诈骗案件" — "is not a fraud case but is not a credit-card fraud
# case"); it likely should read "属于诈骗案件但不属于信用卡诈骗案件" —
# confirm against the original dataset before relying on it.
data = [
{
"instruction":"你是一个信用卡诈骗案件识别专家,请帮助我分析下面案件是否属于信用卡诈骗案件",
"input":"2020年4月21日,我刑警大队接到肇东市居民马兴阁报案称:2020年4月20日在肇东市十七道街北万福家园楼下大丰收烧烤店自己家中,接到一个陌生电话问其是否需要贷款,马兴阁说需要贷款,对方让其添加QQ为好友后,让其下载泛华金融APP让其在泛华金融APP上办理贷款业务,在办理贷款业务时被骗走人民币3000元。",
"output":"该案件不属于诈骗案件但不属于信用卡诈骗案件"
} ]
# Run speculative-decoding generation over each record, timing end-to-end
# latency and collecting (elapsed_seconds, generated_char_count) per sample.
# The pasted original had all indentation stripped (IndentationError as-is);
# the loop structure below is the reconstructed intent.
time__s = []
for da in data:
    question = da['instruction'] + "案件描述如下:" + da['input']
    s_time = time.time()
    inputs = tokenizer(question, return_tensors="pt")
    input_ids = inputs["input_ids"]
    print("**************************** 开始进行模型预测 ****************************")
    pred = model.generate(input_ids,
                          max_new_tokens=300,
                          )
    print("**************************** 开始进行decode ****************************")
    # Strip the prompt tokens from each returned sequence. The original
    # shadowed `input_ids` with the zip loop variable — harmless at batch
    # size 1 but a latent bug; use a distinct name instead.
    for prompt_ids, output_ids in zip(input_ids, pred):
        generated_ids = output_ids[len(prompt_ids):]
        text = tokenizer.decode(generated_ids, skip_special_tokens=True)
    e_time = time.time()
    times = e_time - s_time
    print("执行时间:", times, "秒")
    print(text)
    time__s.append((times, len(text)))
# Report overall generation throughput: total generated characters divided by
# total elapsed seconds across all samples.
# BUG FIX: the original accumulated into the loop variables themselves
# (`for t, l in time__s: t += t; l += l`), so each iteration's doubling was
# discarded on the next unpack and the "totals" were just 2x the last sample.
# Use dedicated accumulators instead.
total_time = 0.0
total_chars = 0
for t, l in time__s:
    total_time += t
    total_chars += l
if total_time > 0:
    # chars/sec is a generation *speed*, not an average latency, so the
    # message is reworded accordingly (original said 平均耗时 "average time").
    print("模型的平均生成速度约为:" + str(total_chars / total_time) + " 字/s")