# 1 图片推理 (image inference)
from PIL import Image
import requests
import torch
from torchvision import io
from typing import Dict
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
# Local checkpoint directory (under the ModelScope cache) that both the model
# weights and the processor are loaded from, and the picture to be described.
_MODEL_DIR = "/home/fyo/.cache/modelscope/hub/qwen/Qwen2-VL-2B-Instruct"
_IMAGE_PATH = "/home/fyo/Pictures/earthquake.jpg"

# dtype and device placement are both left on "auto", i.e. delegated to the
# library rather than pinned here.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    _MODEL_DIR,
    torch_dtype="auto",
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(_MODEL_DIR)

# Input picture, opened with PIL.
image = Image.open(_IMAGE_PATH)
# Single-turn chat: one user message whose content is an image entry followed
# by a text instruction. The image entry holds only a type tag, no image data.
_image_part = {"type": "image"}
# The prompt text is Chinese for "Describe this picture."
_text_part = {"type": "text", "text": "描述这张图."}
conversation = [
    {"role": "user", "content": [_image_part, _text_part]},
]
# Render the conversation through the processor's chat template into a prompt
# string; add_generation_prompt=True asks the template to end the prompt where
# the model's reply should begin.
text_prompt = processor.apply_chat_template(
    conversation, add_generation_prompt=True
)

# Build a one-element batch from the prompt and the image and return PyTorch
# tensors ("pt"); padding is enabled.
inputs = processor(
    text=[text_prompt],
    images=[image],
    padding=True,
    return_tensors="pt",
)

# Move the batch onto the device the model was loaded on. The model is loaded
# with device_map="auto", so it may not be on the CPU, and a bare
# `inputs = inputs` would leave the tensors where the processor created them.
# Using model.device (instead of a hard-coded "cuda") keeps CPU-only runs working.
inputs = inputs.to(model.device)