
Fixing PyTorch DataLoader batch_size loading issues

"保存数据-pytorch dataloader 取batch_size时候出现bug的解决方式" 在PyTorch中,dataloader是训练神经网络模型时常用的数据加载工具,它能够批量加载数据并进行预处理,从而优化内存管理和计算效率。然而,在实际使用中,可能会遇到在设置`batch_size`时出现bug的情况。这通常是由于以下几个原因: 1. **数据集大小与batch_size的关系**:`batch_size`应小于等于数据集的大小。如果`batch_size`大于数据集的样本数,会导致无法形成完整的批次,引发错误。确保`batch_size`的设定合理,避免超出数据集的实际容量。 2. **数据集的划分**:在训练过程中,通常会将数据集分为训练集、验证集和测试集。确保你在使用dataloader时指向正确的数据集,并且数据集已经被正确地切分。 3. **DataLoader的初始化**:确保在创建Dataloader对象时,正确设置了`shuffle`和`drop_last`参数。`shuffle=True`会在每次迭代时随机打乱数据顺序,而`drop_last=True`会丢弃最后一个不足`batch_size`的批次。如果数据集的样本数不能被`batch_size`整除,`drop_last=True`可以避免因最后一个批次不足`batch_size`而引发的问题。 4. **内存问题**:当`batch_size`设置过大时,可能会超出GPU的内存限制。根据你的硬件资源和模型复杂度,适当调整`batch_size`以避免内存溢出。 5. **数据预处理**:在构建dataloader之前,确保数据预处理步骤(如归一化、填充等)已经完成,这些预处理可能会影响批次的构造。 6. **多线程和多进程**:PyTorch的Dataloader支持多线程或多进程加载数据,以提高效率。如果设置不当,如`num_workers`过大,可能会导致进程间的同步问题,进而影响`batch_size`的正常提取。 7. **异常处理**:在训练循环中添加适当的异常处理代码,例如`try-except`块,可以帮助捕获并解决因`batch_size`问题导致的错误。 解决这些问题的方法包括: - 检查并确保`batch_size`设置正确,不超过数据集的大小。 - 确保数据集已正确切分,并在dataloader中使用正确的数据子集。 - 根据需求调整`shuffle`和`drop_last`参数。 - 监控GPU内存使用情况,避免`batch_size`过大导致内存溢出。 - 检查数据预处理步骤,确保批次构建无误。 - 调整`num_workers`以平衡数据加载速度和系统资源。 - 添加异常处理代码,以便在出现问题时能及时捕获和恢复。 通过排查以上问题,通常可以解决PyTorch中与`batch_size`相关的bug。如果问题依然存在,进一步检查代码逻辑或查阅PyTorch官方文档和社区论坛寻求帮助也是一个有效的途径。

Related resources


```
import scipy.io
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# 1. Load the MAT file (unchanged)
def load_matlab_data(file_path):
    data = scipy.io.loadmat(file_path)
    csi = np.squeeze(data['csi'])
    allocations = np.squeeze(data['allocations'])
    symbols = np.squeeze(data['symbols_with_channel'])
    snr = np.squeeze(data['snr'])
    return csi, allocations, symbols, snr

# 2. Data preprocessing (refactored)
def preprocess_data(csi, allocations, snr):
    csi_abs = np.abs(csi)
    snr_expanded = np.expand_dims(snr, axis=1).repeat(csi_abs.shape[1], axis=1)
    X = np.concatenate([csi_abs, snr_expanded], axis=-1)
    y = allocations
    return X, y

# 3. Define the LSTM model (fixed)
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        out, _ = self.lstm(x)
        return self.fc(out)

# 4. Training and validation (fixed)
def train_model(model, X_train, y_train, num_epochs=50, batch_size=32, lr=1e-3):
    dataset = TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.long)
    )
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch_X, batch_y in dataloader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs.permute(0, 2, 1), batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(dataloader):.4f}')

def evaluate_model(model, X_test, y_test):
    model.eval()
    with torch.no_grad():
        outputs = model(torch.tensor(X_test, dtype=torch.float32))
        preds = outputs.argmax(dim=-1)
        accuracy = (preds == torch.tensor(y_test, dtype=torch.long)).float().mean()
        print(f'Test Accuracy: {accuracy.item():.4f}')

# Main function (fixed data split)
def main():
    csi, allocations, _, snr = load_matlab_data('ofdm_dataset_with_channel.mat')
    X, y = preprocess_data(csi, allocations, snr)
    # Chronological split
    split_idx = int(0.8 * len(X))
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]
    model = LSTMModel(
        input_dim=X_train.shape[-1],        # input dim is num_users + 1
        hidden_dim=128,
        output_dim=np.max(allocations) + 1  # number of classes
    )
    train_model(model, X_train, y_train)
    evaluate_model(model, X_test, y_test)

if __name__ == '__main__':
    main()
```

Fix the bugs in this code.


```
import os
import torch
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    Trainer
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import logging
import psutil
import gc
from datetime import datetime

# === Configuration ===
MODEL_NAME = "/home/vipuser/ai_writer_project_final_with_fixed_output_ui/models/Yi-6B"
DATASET_PATH = "./data/train_lora_formatted.jsonl"
OUTPUT_DIR = "./yi6b-lora-optimized"
DEVICE_MAP = "auto"  # automatic device mapping

# Make sure the output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# === Memory optimization ===
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"  # reduce memory fragmentation
torch.backends.cuda.cufft_plan_cache.clear()  # clear the CUDA cuFFT plan cache

# === Enhanced logging ===
def setup_logging(output_dir):
    """Configure logging to file, console, and TensorBoard."""
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    # File handler
    file_handler = logging.FileHandler(os.path.join(output_dir, "training.log"))
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(file_handler)

    # Console handler
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logger.addHandler(console_handler)

    # TensorBoard log directory
    tensorboard_log_dir = os.path.join(output_dir, "logs", datetime.now().strftime("%Y%m%d-%H%M%S"))
    os.makedirs(tensorboard_log_dir, exist_ok=True)

    # Attach a TensorBoard writer if available
    tb_writer = None
    try:
        from torch.utils.tensorboard import SummaryWriter
        tb_writer = SummaryWriter(log_dir=tensorboard_log_dir)
        logger.info(f"TensorBoard log dir: {tensorboard_log_dir}")
    except ImportError:
        logger.warning("TensorBoard is not installed; visualization disabled")
    return logger, tb_writer

logger, tb_writer = setup_logging(OUTPUT_DIR)

# === Quantization config (more memory-efficient) ===
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# === Load the model ===
logger.info("Loading pretrained model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map=DEVICE_MAP,
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    attn_implementation="flash_attention_2"  # FlashAttention to save memory
)

# === Tokenizer ===
logger.info("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.padding_side = "right"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# === Prepare the model for training ===
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=True  # gradient checkpointing to save memory
)

# === LoRA config (memory-optimized) ===
logger.info("Configuring LoRA...")
lora_config = LoraConfig(
    r=64,                                 # lower rank to reduce memory use
    lora_alpha=32,                        # lower alpha
    target_modules=["q_proj", "v_proj"],  # fewer target modules
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

# Log trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
logger.info(f"Trainable params: {trainable_params:,} / total: {total_params:,} ({trainable_params / total_params:.2%})")

# === Load and preprocess the dataset ===
logger.info("Loading and preprocessing dataset...")
dataset = load_dataset("json", data_files=DATASET_PATH, split="train")

# Text filter
def is_valid_text(example):
    text = example.get("text", "")
    return text is not None and len(text.strip()) > 200  # raise the minimum length

dataset = dataset.filter(is_valid_text)
logger.info(f"Dataset size after filtering: {len(dataset)}")

# Tokenization with dynamic padding to save memory
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        padding=True,     # dynamic padding
        truncation=True,
        max_length=1024,  # shorter context to reduce memory use
    )
    # Causal LM needs labels = input_ids
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
    batch_size=64,  # smaller map batches to cut the memory peak
    num_proc=4,     # fewer processes to reduce overhead
)

# === Data collator ===
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # causal language modeling
)

# === Training arguments (memory-optimized) ===
report_to_list = ["tensorboard"] if tb_writer else []
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,   # much smaller batch size
    gradient_accumulation_steps=4,   # accumulate to keep the effective batch size
    learning_rate=2e-5,
    num_train_epochs=3,
    logging_steps=50,
    save_strategy="steps",
    save_steps=500,
    bf16=True,
    optim="paged_adamw_32bit",
    report_to=report_to_list,
    warmup_ratio=0.05,
    gradient_checkpointing=True,     # enable gradient checkpointing
    fp16=False,
    max_grad_norm=0.3,               # lower gradient-clipping threshold
    remove_unused_columns=True,      # drop unused columns to save memory
    dataloader_num_workers=4,        # fewer data-loading workers
    evaluation_strategy="steps",
    eval_steps=500,
    save_total_limit=2,              # keep fewer checkpoints
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    load_best_model_at_end=True,
    ddp_find_unused_parameters=False,
    logging_first_step=True,
    group_by_length=True,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
)

# === GPU monitoring ===
def monitor_gpu():
    """Report GPU memory usage."""
    if torch.cuda.is_available():
        device = torch.device("cuda")
        mem_alloc = torch.cuda.memory_allocated(device) / 1024 ** 3
        mem_reserved = torch.cuda.memory_reserved(device) / 1024 ** 3
        mem_total = torch.cuda.get_device_properties(device).total_memory / 1024 ** 3
        return {
            "allocated": f"{mem_alloc:.2f} GB",
            "reserved": f"{mem_reserved:.2f} GB",
            "total": f"{mem_total:.2f} GB",
            "utilization": f"{mem_alloc / mem_total * 100:.1f}%"
        }
    return {}

# === Trainer ===
eval_dataset = None
if len(tokenized_dataset) > 100:
    eval_dataset = tokenized_dataset.select(range(100))

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# === Pre-training validation ===
def validate_data_and_model():
    """Sanity-check the data and the model before training."""
    logger.info("\n=== Pre-training validation ===")

    # Inspect a sample
    sample = tokenized_dataset[0]
    logger.info(f"Sample keys: {list(sample.keys())}")
    logger.info(f"input_ids length: {len(sample['input_ids'])}")

    # Build a single-sample test batch and move it to the device
    test_batch = data_collator([sample])
    test_batch = {k: v.to(model.device) for k, v in test_batch.items()}

    # Forward pass
    model.train()
    outputs = model(**test_batch)
    loss_value = outputs.loss.item()
    logger.info(f"Test batch loss: {loss_value:.4f}")
    if tb_writer:
        tb_writer.add_scalar("debug/test_loss", loss_value, 0)

    # Backward pass
    outputs.loss.backward()
    logger.info("Backward pass succeeded!")
    model.zero_grad()
    logger.info("Validation done; ready to train\n")

    # Initial GPU status
    gpu_status = monitor_gpu()
    logger.info(f"Initial GPU status: {gpu_status}")
    if tb_writer:
        tb_writer.add_text("system/initial_gpu", str(gpu_status), 0)

validate_data_and_model()

# === Custom callback: resource monitoring ===
class ResourceMonitorCallback(transformers.TrainerCallback):
    def __init__(self, tb_writer=None):
        self.tb_writer = tb_writer
        self.start_time = datetime.now()
        self.last_log_time = datetime.now()

    def on_step_end(self, args, state, control, **kwargs):
        current_time = datetime.now()
        time_diff = (current_time - self.last_log_time).total_seconds()
        # Log resource usage once a minute
        if time_diff > 60:
            self.last_log_time = current_time
            # GPU
            gpu_status = monitor_gpu()
            logger.info(f"Step {state.global_step} - GPU status: {gpu_status}")
            # CPU and RAM
            cpu_percent = psutil.cpu_percent()
            mem = psutil.virtual_memory()
            logger.info(
                f"CPU usage: {cpu_percent}%, RAM: {mem.used / 1024 ** 3:.2f}GB/{mem.total / 1024 ** 3:.2f}GB")
            if self.tb_writer:
                if torch.cuda.is_available():
                    device = torch.device("cuda")
                    mem_alloc = torch.cuda.memory_allocated(device) / 1024 ** 3
                    self.tb_writer.add_scalar("system/gpu_mem", mem_alloc, state.global_step)
                self.tb_writer.add_scalar("system/cpu_usage", cpu_percent, state.global_step)
                self.tb_writer.add_scalar("system/ram_usage", mem.used / 1024 ** 3, state.global_step)

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Mirror training metrics to TensorBoard."""
        if self.tb_writer and logs is not None:
            for metric_name, metric_value in logs.items():
                if "loss" in metric_name or "lr" in metric_name or "grad_norm" in metric_name:
                    self.tb_writer.add_scalar(f"train/{metric_name}", metric_value, state.global_step)

    def on_train_end(self, args, state, control, **kwargs):
        """Log total training time."""
        training_time = datetime.now() - self.start_time
        logger.info(f"Total training time: {training_time}")
        if self.tb_writer:
            self.tb_writer.add_text("system/total_time", str(training_time))

trainer.add_callback(ResourceMonitorCallback(tb_writer=tb_writer))

# === Memory cleanup ===
def clear_memory():
    """Free Python and GPU memory."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    logger.info("Memory cleared")

# === Training ===
try:
    logger.info("Starting training...")

    # Train in chunks to cap the memory peak
    num_samples = len(tokenized_dataset)
    chunk_size = 1000  # 1000 samples per chunk
    for i in range(0, num_samples, chunk_size):
        end_idx = min(i + chunk_size, num_samples)
        logger.info(f"Training samples {i} to {end_idx - 1} / {num_samples}")
        # Build the sub-dataset, point the trainer at it, and train
        chunk_dataset = tokenized_dataset.select(range(i, end_idx))
        trainer.train_dataset = chunk_dataset
        trainer.train()
        clear_memory()

    # Save training metrics
    metrics = trainer.evaluate()
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)

    # Save the best model
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    logger.info(f"Training done! Model saved to: {OUTPUT_DIR}")

    if tb_writer:
        for metric_name, metric_value in metrics.items():
            tb_writer.add_scalar(f"final/{metric_name}", metric_value)
        tb_writer.close()

except Exception as e:
    logger.error(f"Training failed: {e}")
    import traceback
    logger.error(traceback.format_exc())

    # Retry with a much smaller dataset
    logger.info("\nRetrying with a small batch...")
    small_dataset = tokenized_dataset.select(range(50))
    trainer.train_dataset = small_dataset
    trainer.train()
    trainer.save_model(f"{OUTPUT_DIR}_small")
    tokenizer.save_pretrained(f"{OUTPUT_DIR}_small")
    logger.info(f"Small-batch training done! Model saved to: {OUTPUT_DIR}_small")
    if tb_writer:
        tb_writer.add_text("error/exception", traceback.format_exc())

clear_memory()

# === Post-training validation ===
def validate_final_model():
    """Validate the trained model."""
    logger.info("\n=== Post-training validation ===")
    from peft import PeftModel

    # Load only the base model
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map=DEVICE_MAP,
        quantization_config=quant_config,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        load_in_4bit=True
    )
    # Attach the LoRA adapter; run inference without merging the weights
    peft_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
    peft_model.eval()

    # Generation test
    prompt = "中国的首都是"
    inputs = tokenizer(prompt, return_tensors="pt").to(peft_model.device)
    outputs = peft_model.generate(
        **inputs,
        max_new_tokens=50,  # shorter generations
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True
    )
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    logger.info(f"Prompt: {prompt}")
    logger.info(f"Output: {generated}")
    if tb_writer:
        tb_writer.add_text("validation/sample", f"Prompt: {prompt}\nOutput: {generated}")

    # Broader tests
    test_prompts = [
        "人工智能的未来发展趋势是",
        "如何学习深度学习?",
        "写一个关于太空探索的短故事:"
    ]
    for i, test_prompt in enumerate(test_prompts):
        inputs = tokenizer(test_prompt, return_tensors="pt").to(peft_model.device)
        outputs = peft_model.generate(
            **inputs,
            max_new_tokens=100,  # shorter generations
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.2,
            do_sample=True
        )
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        logger.info(f"\nPrompt: {test_prompt}\nOutput: {generated_text}\n{'=' * 50}")
        if tb_writer:
            tb_writer.add_text(f"validation/test_{i}", f"Prompt: {test_prompt}\nOutput: {generated_text}")

    logger.info("Validation done")

validate_final_model()

# Close the TensorBoard writer
if tb_writer:
    tb_writer.close()
    logger.info("TensorBoard writer closed")
```

How does this code look? Could it hit an error like `RuntimeError: mat1 and mat2 shapes cannot be multiplied (3x4096 and 512x4096)`?


```
import os

# Use a single GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# Model training
import torch
from MyDataset import MyDataset
from torch.utils.data import DataLoader
from MyModel import MyModel
from transformers import BertTokenizer, AdamW

print(torch.cuda.memory_summary())  # inspect GPU memory allocation

# Device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Number of training epochs (one epoch = one full pass over the dataset)
EPOCH = 5

# Load the vocabulary and tokenizer
token = BertTokenizer.from_pretrained(r"C:\Users\asus\Desktop\Graduation project\bert-base-chinese\models--bert-base-chinese\snapshots\c30a6ed22ab4564dc1e3b2ecbf6e766b0611a33f")

# Encode the incoming strings
def collate_fn(data):
    sents = [i[0] for i in data]
    label = [i[1] for i in data]
    # Encode
    data = token.batch_encode_plus(
        # sentences to encode
        batch_text_or_text_pairs=sents,
        # truncate when a sentence exceeds max_length (capped by model_max_length)
        truncation=True,
        max_length=512,
        # pad everything to max_length
        padding="max_length",
        # 'pt' returns PyTorch tensors, 'tf' TensorFlow, 'np' NumPy;
        # the default None returns a dict of lists
        return_tensors="pt",
        # also return sequence lengths
        return_length=True,
        # input_ids, attention_mask, token_type_ids are returned by default
    )
    input_ids = data["input_ids"]
    attention_mask = data["attention_mask"]
    token_type_ids = data["token_type_ids"]
    # Convert labels to a PyTorch LongTensor
    label = torch.LongTensor(label)
    return input_ids, attention_mask, token_type_ids, label

# Build the dataset and loader
train_dataset = MyDataset("train")
train_loader = DataLoader(
    dataset=train_dataset,
    # Batch size: tune per device. On GPU, aim for >90% memory utilization;
    # on CPU, anything that doesn't exhaust RAM is fine.
    batch_size=8,
    # Shuffle the data within each batch/epoch
    shuffle=True,
    # Drop the last incomplete batch to avoid shape errors, e.g. with batches
    # of 10 and 111 samples total, the final batch of fewer than 10 is dropped
    # so every batch has exactly 10
    drop_last=True,
    # Encode each batch on the fly
    collate_fn=collate_fn
)

if __name__ == '__main__':
    # Start training
    print(DEVICE)
    model = MyModel().to(DEVICE)
    # Optimizer
    optimizer = AdamW(model.parameters())
    # Loss: this is a binary classification task, using cross entropy
    loss_func = torch.nn.CrossEntropyLoss()

    for epoch in range(EPOCH):
        for i, (input_ids, attention_mask, token_type_ids, label) in enumerate(train_loader):
            # Move the batch to the same device as the model
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            token_type_ids = token_type_ids.to(DEVICE)
            label = label.to(DEVICE)

            # Forward pass (feed the data through the model)
            output = model(input_ids, attention_mask, token_type_ids)
            # Compute the loss from the output
            loss = loss_func(output, label)

            # The next three lines are what make the model learn;
            # remove them and the model will not train
            optimizer.zero_grad()  # clear gradients
            loss.backward()        # compute gradients
            optimizer.step()       # update parameters

            # Log every 5 batches
            if i % 5 == 0:
                # Turn the output into predicted labels
                out = output.argmax(dim=1)
                # Training accuracy
                acc = (out == label).sum().item() / len(label)
                # Print epoch, loss, accuracy
                print(f"epoch:{epoch},i:{i},loss:{loss.item()},acc:{acc}")

        # Save parameters after every epoch; create the params folder first,
        # otherwise saving fails
        if not os.path.exists("params"):
            os.makedirs("params")
        # Save the state_dict correctly
        print(f"Saving model: epoch {epoch}")
        torch.save(model.state_dict(), f"params/{epoch}_bert.pth")
        print(f"Model saved: epoch {epoch}")
        print(epoch, "parameters saved!")
```

Please explain this code.


Module 4

```
Epoch 1/10
训练:   0%|          | 0/1863 [00:05<?, ?it/s]
---------------------------------------------------------------------------
Empty                                     Traceback (most recent call last)
File ~\.conda\envs\Anaconda\Lib\site-packages\torch\utils\data\dataloader.py:1243, in _MultiProcessingDataLoaderIter._try_get_data(self, timeout)
   1242 try:
-> 1243     data = self._data_queue.get(timeout=timeout)
   1244     return (True, data)

File ~\.conda\envs\Anaconda\Lib\multiprocessing\queues.py:114, in Queue.get(self, block, timeout)
    113 if not self._poll(timeout):
--> 114     raise Empty
    115 elif not self._poll():

Empty:

The above exception was the direct cause of the following exception:

RuntimeError                              Traceback (most recent call last)
Cell In[10], line 79
     76 print(f"\nEpoch {epoch+1}/{num_epochs}")
     78 # Training phase
---> 79 train_loss, train_acc = train_model(model, train_loader, criterion, optimizer, device)
     80 train_losses.append(train_loss)
     81 train_accs.append(train_acc)

Cell In[10], line 19, in train_model(model, train_loader, criterion, optimizer, device)
     16 correct = 0
     17 total = 0
---> 19 for inputs, labels in tqdm(train_loader, desc="训练"):
     20     inputs, labels = inputs.to(device), labels.to(device)
     22     optimizer.zero_grad()

File ~\.conda\envs\Anaconda\Lib\site-packages\tqdm\std.py:1181, in tqdm.__iter__(self)
   1178 time = self._time
   1180 try:
-> 1181     for obj in iterable:
   1182         yield obj
   1183         # Update and possibly print the progressbar.
   1184         # Note: does not call self.update(1) for speed optimisation.

File ~\.conda\envs\Anaconda\Lib\site-packages\torch\utils\data\dataloader.py:701, in _BaseDataLoaderIter.__next__(self)
    698 if self._sampler_iter is None:
    699     # TODO(https://github.com/pytorch/pytorch/issues/76750)
    700     self._reset()  # type: ignore[call-arg]
--> 701 data = self._next_data()
    702 self._num_yielded += 1
    703 if (
    704     self._dataset_kind == _DatasetKind.Iterable
    705     and self._IterableDataset_len_called is not None
    706     and self._num_yielded > self._IterableDataset_len_called
    707 ):

File ~\.conda\envs\Anaconda\Lib\site-packages\torch\utils\data\dataloader.py:1448, in _MultiProcessingDataLoaderIter._next_data(self)
   1445     return self._process_data(data)
   1447 assert not self._shutdown and self._tasks_outstanding > 0
-> 1448 idx, data = self._get_data()
   1449 self._tasks_outstanding -= 1
   1450 if self._dataset_kind == _DatasetKind.Iterable:
   1451     # Check for _IterableDatasetStopIteration

File ~\.conda\envs\Anaconda\Lib\site-packages\torch\utils\data\dataloader.py:1412, in _MultiProcessingDataLoaderIter._get_data(self)
   1408 # In this case, `self._data_queue` is a `queue.Queue`,. But we don't
   1409 # need to call `.task_done()` because we don't use `.join()`.
   1410 else:
   1411     while True:
-> 1412         success, data = self._try_get_data()
   1413         if success:
   1414             return data

File ~\.conda\envs\Anaconda\Lib\site-packages\torch\utils\data\dataloader.py:1256, in _MultiProcessingDataLoaderIter._try_get_data(self, timeout)
   1254 if len(failed_workers) > 0:
   1255     pids_str = ", ".join(str(w.pid) for w in failed_workers)
-> 1256     raise RuntimeError(
   1257         f"DataLoader worker (pid(s) {pids_str}) exited unexpectedly"
   1258     ) from e
   1259 if isinstance(e, queue.Empty):
   1260     return (False, None)

RuntimeError: DataLoader worker (pid(s) 28620, 21908, 29584, 29632) exited unexpectedly
```


```
C:\Users\86159\anaconda3\envs\pytorch\Lib\site-packages\thinc\shims\pytorch.py:261: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  model.load_state_dict(torch.load(filelike, map_location=device))

Traceback (most recent call last):
  File "C:\Users\86159\Desktop\Learn_pytorch\train.py", line 41, in <module>
    train()
  File "C:\Users\86159\Desktop\Learn_pytorch\train.py", line 24, in train
    for batch in train_loader:
  File "C:\Users\86159\anaconda3\envs\pytorch\Lib\site-packages\torch\utils\data\dataloader.py", line 701, in __next__
    data = self._next_data()
           ^^^^^^^^^^^^^^^^^
  File "C:\Users\86159\anaconda3\envs\pytorch\Lib\site-packages\torch\utils\data\dataloader.py", line 757, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\86159\anaconda3\envs\pytorch\Lib\site-packages\torch\utils\data\_utils\fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\86159\anaconda3\envs\pytorch\Lib\site-packages\torch\utils\data\_uti
```
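The FutureWarning above is separate from the crash and carries its own recommendation: pass `weights_only=True` when loading checkpoints you don't fully control, which restricts unpickling to plain tensors and state dicts. A minimal sketch following that recommendation (the checkpoint path here is illustrative):

```
import torch

# Safer default per the warning: restrict unpickling to tensors/state dicts.
state_dict = torch.load("params/0_bert.pth", map_location="cpu", weights_only=True)
```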


```
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 97 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -11) local_rank: 1 (pid: 98) of binary: /opt/conda/bin/python
Traceback (most recent call last):
  File "/opt/conda/bin/torchrun", line 33, in <module>
    sys.exit(load_entry_point('torch==1.13.1+cu116', 'console_scripts', 'torchrun')())
  File "/opt/conda/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "/opt/conda/lib/python3.9/site-packages/torch/distributed/run.py", line 762, in main
    run(args)
  File "/opt/conda/lib/python3.9/site-packages/torch/distributed/run.py", line 753, in run
    elastic_launch(
  File "/opt/conda/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/opt/conda/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
====================================================
tools/train.py FAILED
----------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
----------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2025-03-09_20:49:13
  host      : yons-MS-7E06
  rank      : 1 (local_rank: 1)
  exitcode  : -11 (pid: 98)
  error_file: <N/A>
  traceback : Signal 11 (SIGSEGV) received by PID 98
====================================================
```

What is this problem, and how do I fix it?


```
Traceback (most recent call last):
  File "D:\ANAACONDA\Lib\site-packages\torch\utils\data\dataloader.py", line 1251, in _try_get_data
    data = self._data_queue.get(timeout=timeout)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\ANAACONDA\Lib\queue.py", line 179, in get
    raise Empty
_queue.Empty

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "D:\YOLOv11\YOLOv11源码文件(2024.10.29)\train.py", line 14, in <module>
    model.train(data='ultralytics/cfg/datasets/NEU-DET.yaml',  # path to the dataset config; this .yaml holds the dataset paths and class info
  File "D:\YOLOv11\YOLOv11源码文件(2024.10.29)\ultralytics\engine\model.py", line 802, in train
    self.trainer.train()
  File "D:\YOLOv11\YOLOv11源码文件(2024.10.29)\ultralytics\engine\trainer.py", line 207, in train
    self._do_train(world_size)
  File "D:\YOLOv11\YOLOv11源码文件(2024.10.29)\ultralytics\engine\trainer.py", line 367, in _do_train
    for i, batch in pbar:
        ^^^^
  File "D:\ANAACONDA\Lib\site-packages\tqdm\std.py", line 1181, in __iter__
    for obj in iterable:
  File "D:\YOLOv11\YOLOv11源码文件(2024.10.29)\ultralytics\data\build.py", line 48, in __iter__
    yield next(self.iterator)
          ^^^^^^^^^^^^^^^^^^^
  File "D:\ANAACONDA\Lib\site-packages\torch\utils\data\dataloader.py", line 708, in __next__
    data = self._next_data()
           ^^^^^^^^^^^^^^^^^
  File "D:\ANAACONDA\Lib\site-packages\torch\utils\data\dataloader.py", line 1458, in _next_data
    idx, data = self._get_data()
                ^^^^^^^^^^^^^^^^
  File "D:\ANAACONDA\Lib\site-packages\torch\utils\data\dataloader.py", line 1410, in _get_data
    success, data = self._try_get_data()
                    ^^^^^^^^^^^^^^^^^^^^
  File "D:\ANAACONDA\Lib\site-packages\torch\utils\data\dataloader.py", line 1264, in _try_get_data
    raise RuntimeError(
RuntimeError: DataLoader worker (pid(s) 37572) exited unexpectedly
```
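Both "DataLoader worker exited unexpectedly" tracebacks above come from multiprocessing worker processes dying. A commonly suggested mitigation, especially on Windows, is to guard the script's entry point and temporarily fall back to single-process loading; a hedged sketch under those assumptions (names and sizes are illustrative, not from the posts above):

```
import torch
from torch.utils.data import DataLoader, TensorDataset

def build_loader(dataset):
    # num_workers=0 loads batches in the main process, which sidesteps
    # crashing worker processes at the cost of throughput; raise it again
    # once the underlying dataset or environment issue is fixed.
    return DataLoader(dataset, batch_size=16, shuffle=True, num_workers=0)

if __name__ == "__main__":
    # On Windows, DataLoader worker processes re-import this module, so any
    # code that iterates a multi-worker DataLoader must sit under this guard.
    toy = TensorDataset(torch.randn(64, 8), torch.zeros(64, dtype=torch.long))
    for xb, yb in build_loader(toy):
        pass
```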
