上述代码出现以下问题: * Running on local URL: http://0.0.0.0:7860 * To create a public link, set `share=True` in `launch()`. Generating train split: 0 examples [00:00, ? examples/s] Generating train split: 349 examples [00:00, 13213.09 examples/s] Generating train split: 0 examples [00:00, ? examples/s] Generating train split: 85 examples [00:00, 5189.16 examples/s] 从远程加载Tokenizer: deepseek-ai/deepseek-llm-7b-base Map: 0%| | 0/349 [00:00<?, ? examples/s] Map: 100%|██████████| 349/349 [00:00<00:00, 8892.39 examples/s] Map: 0%| | 0/85 [00:00<?, ? examples/s] Map: 100%|██████████| 85/85 [00:00<00:00, 4877.56 examples/s] Map: 0%| | 0/349 [00:00<?, ? examples/s] Map: 0%| | 0/349 [00:00<?, ? examples/s] Traceback (most recent call last): File "D:\Lib\site-packages\gradio\queueing.py", line 625, in process_events response = await route_utils.call_process_api( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ...<5 lines>... ) ^ File "D:\Lib\site-packages\gradio\route_utils.py", line 322, in call_process_api output = await app.get_blocks().process_api( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ...<11 lines>... ) ^ File "D:\Lib\site-packages\gradio\blocks.py", line 2193, in process_api result = await self.call_function( ^^^^^^^^^^^^^^^^^^^^^^^^^ ...<8 lines>... 
) ^ File "D:\Lib\site-packages\gradio\blocks.py", line 1704, in call_function prediction = await anyio.to_thread.run_sync( # type: ignore ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ fn, *processed_input, limiter=self.limiter ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ) ^ File "D:\Lib\site-packages\anyio\to_thread.py", line 56, in run_sync return await get_async_backend().run_sync_in_worker_thread( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ func, args, abandon_on_cancel=abandon_on_cancel, limiter=limiter ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ) ^ File "D:\Lib\site-packages\anyio\_backends\_asyncio.py", line 2470, in run_sync_in_worker_thread return await future ^^^^^^^^^^^^ File "D:\Lib\site-packages\anyio\_backends\_asyncio.py", line 967, in run result = context.run(func, *args) File "D:\Lib\site-packages\gradio\utils.py", line 894, in wrapper response = f(*args, **kwargs) File "C:\Users\Y\Desktop\python\100.py", line 130, in fine_tune_model tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"]) File "D:\Lib\site-packages\datasets\arrow_dataset.py", line 557, in wrapper out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs) ~~~~^^^^^^^^^^^^^^^^^^^^^^^ File "D:\Lib\site-packages\datasets\arrow_dataset.py", line 3079, in map for rank, done, content in Dataset._map_single(**dataset_kwargs): ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^ File "D:\Lib\site-packages\datasets\arrow_dataset.py", line 3525, in _map_single for i, batch in iter_outputs(shard_iterable): ~~~~~~~~~~~~^^^^^^^^^^^^^^^^ File "D:\Lib\site-packages\datasets\arrow_dataset.py", line 3475, in iter_outputs yield i, apply_function(example, i, offset=offset) ~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "D:\Lib\site-packages\datasets\arrow_dataset.py", line 3398, in apply_function processed_inputs = function(*fn_args, *additional_args, **fn_kwargs) File "C:\Users\Y\Desktop\python\100.py", line 122, in tokenize_function 
return tokenizer( examples["text"], ...<2 lines>... padding="max_length" ) TypeError: 'str' object is not callable
时间: 2025-06-04 20:24:22 浏览: 7
### 修复 `TypeError: 'str' object is not callable` 错误
错误发生在 `tokenizer` 函数调用时,通常是因为 `tokenizer` 被重新赋值为字符串类型[^1]。以下是对问题的分析和修复方法:
#### 问题原因
在代码中,`tokenizer` 可能被意外地赋值为字符串,而不是 `AutoTokenizer` 对象。例如,在加载或初始化 `tokenizer` 的过程中,可能出现了异常处理不当的情况,导致 `tokenizer` 被设置为错误的类型。
#### 修复方法
以下是修复后的关键部分代码,确保 `tokenizer` 始终是 `AutoTokenizer` 对象:
```python
def load_tokenizer_locally_or_remotely(model_checkpoint):
    """Load a tokenizer from a local cache if present, otherwise from the hub.

    Args:
        model_checkpoint: Hub id of the model, e.g.
            "deepseek-ai/deepseek-llm-7b-base".

    Returns:
        An ``AutoTokenizer`` instance — never a string, so downstream
        ``tokenizer(...)`` calls remain callable.

    Raises:
        ValueError: if loading fails for any reason. The original exception
            is chained (``from e``) so the real cause stays in the traceback.
    """
    try:
        # The local cache path mirrors the hub id with "/" flattened to "_".
        local_path = os.path.join("local_models", model_checkpoint.replace("/", "_"))
        if os.path.exists(local_path):
            print(f"从本地加载Tokenizer: {local_path}")
            return AutoTokenizer.from_pretrained(local_path, local_files_only=True)
        print(f"从远程加载Tokenizer: {model_checkpoint}")
        return AutoTokenizer.from_pretrained(model_checkpoint)
    except Exception as e:
        # Re-raise instead of returning an error string: assigning/returning a
        # str here is exactly what produces "TypeError: 'str' object is not
        # callable" later. Chain with `from e` to preserve the root cause.
        raise ValueError(f"加载Tokenizer失败: {str(e)}") from e
```
在 `fine_tune_model` 函数中,确保 `tokenizer` 正确加载:
```python
def fine_tune_model(model_name, batch_size, learning_rate, num_epochs, progress=gr.Progress()):
    """Fine-tune a causal language model on the JSONL Q&A dataset in TEMP_DIR.

    Args:
        model_name: Key into the module-level MODEL_OPTIONS mapping.
        batch_size: Per-device train/eval batch size.
        learning_rate: Optimizer learning rate passed to TrainingArguments.
        num_epochs: Number of training epochs.
        progress: Gradio progress tracker (default instance is the standard
            Gradio idiom — Gradio injects a live tracker at call time).

    Returns:
        A status string: either the missing-files message or the
        "training complete" message with the save path.
    """
    progress(0.1, desc="加载数据集")
    # Verify all expected dataset splits exist before doing any work.
    required_files = ["train.jsonl", "valid.jsonl", "test.jsonl"]
    missing_files = [f for f in required_files if not os.path.exists(os.path.join(TEMP_DIR, f))]
    if missing_files:
        return f"数据集不完整,缺少文件: {', '.join(missing_files)}"
    # Load train/valid splits; load_dataset('json', data_files=...) always
    # exposes a single 'train' split, hence the ['train'] index on both.
    progress(0.2, desc="加载数据集")
    train_dataset = load_dataset('json', data_files=os.path.join(TEMP_DIR, "train.jsonl"))['train']
    valid_dataset = load_dataset('json', data_files=os.path.join(TEMP_DIR, "valid.jsonl"))['train']
    # Preprocess the data.
    progress(0.3, desc="预处理数据")
    model_checkpoint = MODEL_OPTIONS[model_name]
    # Must yield a tokenizer OBJECT; a string here is what caused the original
    # "TypeError: 'str' object is not callable" inside tokenize_function.
    tokenizer = load_tokenizer_locally_or_remotely(model_checkpoint)
    # Render each record into a single instruction-formatted prompt string.
    # Assumes every record has 'question' and 'answer' keys — confirm against
    # the JSONL producer.
    def format_instruction(example):
        text = f"### 问题:\n{example['question']}\n\n### 回答:\n{example['answer']}"
        return {"text": text}
    # Apply the formatting (adds a "text" column to each split).
    train_dataset = train_dataset.map(format_instruction)
    valid_dataset = valid_dataset.map(format_instruction)
    # Tokenize the formatted prompt; fixed-length padding keeps batch shapes
    # uniform for the data collator.
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=512,
            padding="max_length"
        )
    # Apply tokenization; drop the raw "text" column (question/answer columns
    # remain, but Trainer's remove_unused_columns handles them by default).
    tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    tokenized_valid = valid_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    # Load the model weights (local cache first, then remote).
    progress(0.4, desc="加载模型")
    model = load_model_locally_or_remotely(model_checkpoint)
    # Training hyperparameters. fp16 only when a CUDA device is available;
    # gradient accumulation of 2 doubles the effective batch size.
    training_args = TrainingArguments(
        output_dir=SAVE_DIR,
        evaluation_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        save_strategy="epoch",
        logging_steps=10,
        report_to="none",
        fp16=torch.cuda.is_available(),
        gradient_accumulation_steps=2,
        warmup_ratio=0.1,
        save_total_limit=2
    )
    # mlm=False -> causal-LM collation (labels are shifted input ids).
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )
    # Assemble the Trainer with the tokenized splits and collator.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        data_collator=data_collator,
    )
    # Run training (blocks until all epochs complete).
    progress(0.5, desc="开始训练")
    trainer.train()
    # Persist model and tokenizer together so the directory is self-contained
    # for later from_pretrained() loading.
    progress(0.9, desc="保存模型")
    model_save_path = os.path.join(SAVE_DIR, f"final_model_{model_name}")
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    return f"模型训练完成!模型已保存到 {model_save_path}"
```
#### 关键修复点
1. **确保 `tokenizer` 始终为 `AutoTokenizer` 对象**:通过 `load_tokenizer_locally_or_remotely` 函数加载 `tokenizer`,并在异常处理中避免将其赋值为字符串。
2. **检查 `tokenizer` 类型**:在使用 `tokenizer` 之前,可以添加类型检查以确保其为 `AutoTokenizer` 对象[^2]。
3. **异常处理**:在加载 `tokenizer` 或 `model` 时,捕获异常并抛出明确的错误信息,而不是将错误信息赋值给变量。
###
阅读全文