"""Train a simple RNN text classifier on the AG_NEWS dataset.

This is the corrected version of the script from the original post, which
asked for its errors to be fixed ("改错").  Fixes applied:

* ``Counter`` was used without being imported; the vocabulary is now built
  directly from the tokenized lines, so no counter is needed.
* ``vocab.stoi`` does not exist on the ``Vocab`` object returned by
  ``build_vocab_from_iterator(..., specials=...)``; tokens are now looked up
  through the vocab itself, with ``<unk>`` as the default index.
* Samples have different lengths, so the default ``DataLoader`` collation
  cannot stack them; ``collate_batch`` pads every batch and reports the true
  lengths.
* The model read ``out[:, -1, :]``, which would be a padded position for the
  shorter sequences of a batch; when lengths are supplied it now uses the
  hidden state at each sequence's real last token.
* Labels are collated into a 1-D tensor instead of relying on
  ``target.squeeze()``, which produces a 0-d tensor for a batch of one.
* The loop variable ``input`` shadowed the builtin and has been renamed.
"""
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence

# Hyperparameters
embedding_dim = 64
hidden_dim = 128
num_epochs = 10
batch_size = 64


class RNN(nn.Module):
    """Embedding -> single-layer RNN -> linear classifier with 4 outputs."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        # Four output logits, one per class (labels are shifted to 0..3 in main()).
        self.fc = nn.Linear(hidden_dim, 4)

    def forward(self, x, lengths=None):
        """Return class logits of shape (batch, 4).

        Args:
            x: LongTensor of token ids, shape (batch, seq_len).
            lengths: optional 1-D tensor with the unpadded length of every
                row of ``x``.  When omitted, every row is treated as full
                length and the output at the last time step is used.
        """
        embedded = self.embedding(x)
        if lengths is None:
            out, _ = self.rnn(embedded)
            return self.fc(out[:, -1, :])
        # Packing makes the RNN stop at each sequence's true end, so the
        # returned hidden state is not polluted by padding tokens.
        # pack_padded_sequence requires the lengths to live on the CPU.
        packed = pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, hidden = self.rnn(packed)
        return self.fc(hidden[-1])


def collate_batch(batch, padding_value=0):
    """Pad a list of ``(token_ids, label)`` samples into batch tensors.

    Returns:
        tokens: LongTensor (batch, max_len), right-padded with ``padding_value``.
        lengths: LongTensor (batch,) holding the original sequence lengths.
        labels: LongTensor (batch,) of class indices.
    """
    sequences, labels = zip(*batch)
    lengths = torch.tensor([len(seq) for seq in sequences], dtype=torch.long)
    tokens = pad_sequence(list(sequences), batch_first=True, padding_value=padding_value)
    return tokens, lengths, torch.tensor(labels, dtype=torch.long)


def main():
    """Build the vocabulary, prepare the data and train the classifier."""
    # torchtext is imported here so that the model and the collate helper
    # above remain importable in environments without torchtext.
    from torchtext.data.utils import get_tokenizer
    from torchtext.datasets import AG_NEWS
    from torchtext.vocab import build_vocab_from_iterator

    # Data preprocessing: tokenize once and shift the raw labels down by one
    # so they can be used as class indices by CrossEntropyLoss.
    tokenizer = get_tokenizer('basic_english')
    samples = [(int(label) - 1, tokenizer(line)) for label, line in AG_NEWS(split='train')]

    vocab = build_vocab_from_iterator(
        (tokens for _, tokens in samples), specials=["<unk>", "<pad>"]
    )
    unk_index = vocab["<unk>"]
    pad_index = vocab["<pad>"]
    vocab.set_default_index(unk_index)  # unknown tokens map to <unk> instead of raising

    # Data loader.  An empty line would yield a zero-length sequence, which
    # cannot be packed, so it is replaced by a single <unk> token.
    train_data = [
        (torch.tensor(vocab(tokens) or [unk_index], dtype=torch.long), label)
        for label, tokens in samples
    ]
    train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=lambda batch: collate_batch(batch, padding_value=pad_index),
    )

    # Model, optimizer and loss function
    model = RNN(len(vocab), embedding_dim, hidden_dim)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()

    # Training loop
    for epoch in range(num_epochs):
        total_loss = 0.0
        for tokens, lengths, labels in train_loader:
            optimizer.zero_grad()
            output = model(tokens, lengths)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch figure is a per-sample mean.
            total_loss += loss.item() * tokens.size(0)
        print("Epoch: {}, Loss: {:.4f}".format(epoch+1, total_loss/len(train_data)))


if __name__ == "__main__":
    main()

import torch
import torch.nn as nn
import math
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import Multi30k
from typing import Iterable, List
from torch import Tensor
from torch.nn import Transformer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from timeit import default_timer as timer

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

SRC_LANGUAGE = 'de'  # the source language is German
TGT_LANGUAGE = 'en'  # the target language is English

# Dictionaries for the tokenizers and for the vocabularies.
token_transform = {}
vocab_transform = {}

# Create the source- and target-language tokenizers; make sure the
# dependencies are installed first:
# pip install -U spacy
# python -m spacy download en_core_web_sm
# python -m spacy download de_core_news_sm
# get_tokenizer returns a tokenizer function. Per the original author's note:
# with no specific tokenizer it splits English text on whitespace, otherwise
# it returns the tokenizer of the named library (spaCy here).
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')

from torchtext.data.utils import get_tokenizer from torchtext.vocab import build_vocab_from_iterator from torchtext.datasets import Multi30k • $get_tokenizer$：获取文本分词器（支持spacy等工具） ...

构建PyTorch DataPipelines高手指南：复杂数据处理的终极解决方案

# 1. PyTorch DataPipelines概述

## 1.1 数据管道在深度学习中的作用

数据管道是深度学习工作流程中不可或缺的一部分，它...PyTorch作为目前流行的深度学习框架，通过DataPipelines为用户提供了强大的数据处理能

深度学习框架对决：TensorFlow vs. PyTorch vs. Keras

![深度学习框架对决：TensorFlow vs. PyTorch vs.... # 摘要本文综述了三种主流深度学习框架：TensorFlow、PyTorch和Keras，从它们的理论基础和实践应用两个方面进行了全面的探讨。首先概述了深度学习框架的发展背景与...

【4. 分类机制】多标签分类的挑战与策略：标签平衡与损失函数调整

![YOLOv8的边界框回归与分类]...# 1. 多标签分类概述多标签分类是机器学习领域的一项核心任务，它要求模型能够识别出一个实例属于多个类别中的任意一个或多于一个。与传统

/home/wiseatc/.local/lib/python3.11/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81. import pkg_resources W0703 16:30:36.069853 3914856 torch/distributed/run.py:766] W0703 16:30:36.069853 3914856 torch/distributed/run.py:766] * W0703 16:30:36.069853 3914856 torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. W0703 16:30:36.069853 3914856 torch/distributed/run.py:766] * /home/wiseatc/.local/lib/python3.11/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81. import pkg_resources [INFO|tokenization_utils_base.py:2048] 2025-07-03 16:30:43,321 >> loading file tokenizer.model [INFO|tokenization_utils_base.py:2048] 2025-07-03 16:30:43,322 >> loading file tokenizer.json [INFO|tokenization_utils_base.py:2048] 2025-07-03 16:30:43,322 >> loading file added_tokens.json [INFO|tokenization_utils_base.py:2048] 2025-07-03 16:30:43,322 >> loading file special_tokens_map.json [INFO|tokenization_utils_base.py:2048] 2025-07-03 16:30:43,322 >> loading file tokenizer_config.json [INFO|tokenization_utils_base.py:2048] 2025-07-03 16:30:43,322 >> loading file chat_template.jinja /home/wiseatc/.local/lib/python3.11/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. 
The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81. import pkg_resources /home/wiseatc/.local/lib/python3.11/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81. import pkg_resources /home/wiseatc/.local/lib/python3.11/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81. import pkg_resources [INFO|tokenization_utils_base.py:2313] 2025-07-03 16:30:43,904 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. 
[INFO|configuration_utils.py:697] 2025-07-03 16:30:43,913 >> loading configuration file /mnt/data1/models/1.5B/config.json [INFO|configuration_utils.py:771] 2025-07-03 16:30:43,919 >> Model config Qwen2Config { "_name_or_path": "/mnt/data1/models/1.5B", "architectures": [ "Qwen2ForCausalLM" ], "attention_dropout": 0.0, "bos_token_id": 151643, "eos_token_id": 151643, "hidden_act": "silu", "hidden_size": 1536, "initializer_range": 0.02, "intermediate_size": 8960, "max_position_embeddings": 131072, "max_window_layers": 21, "model_type": "qwen2", "num_attention_heads": 12, "num_hidden_layers": 28, "num_key_value_heads": 2, "rms_norm_eps": 1e-06, "rope_scaling": null, "rope_theta": 10000, "sliding_window": 4096, "tie_word_embeddings": false, "torch_dtype": "bfloat16", "transformers_version": "4.49.0", "use_cache": true, "use_mrope": false, "use_sliding_window": false, "vocab_size": 151936 } [INFO|tokenization_utils_base.py:2048] 2025-07-03 16:30:43,920 >> loading file tokenizer.model [INFO|tokenization_utils_base.py:2048] 2025-07-03 16:30:43,920 >> loading file tokenizer.json [INFO|tokenization_utils_base.py:2048] 2025-07-03 16:30:43,920 >> loading file added_tokens.json [INFO|tokenization_utils_base.py:2048] 2025-07-03 16:30:43,920 >> loading file special_tokens_map.json [INFO|tokenization_utils_base.py:2048] 2025-07-03 16:30:43,920 >> loading file tokenizer_config.json [INFO|tokenization_utils_base.py:2048] 2025-07-03 16:30:43,920 >> loading file chat_template.jinja [INFO|tokenization_utils_base.py:2313] 2025-07-03 16:30:44,493 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. /usr/local/lib/python3.11/dist-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via init_process_group or barrier . Using the current device set by the user. 
warnings.warn( # warn only once [rank1]:[W703 16:30:45.102845887 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. /usr/local/lib/python3.11/dist-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via init_process_group or barrier . Using the current device set by the user. warnings.warn( # warn only once [rank2]:[W703 16:30:45.126706430 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. /usr/local/lib/python3.11/dist-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via init_process_group or barrier . Using the current device set by the user. warnings.warn( # warn only once [rank3]:[W703 16:30:45.136836682 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. Setting num_proc from 16 back to 1 for the train split to disable multiprocessing as it only contains one shard. Generating train split: 0 examples [00:00, ? examples/s] Generating train split: 120 examples [00:00, 6525.39 examples/s] Converting format of dataset (num_proc=16): 0%| | 0/120 [00:00<?, ? examples/s] Converting format of dataset (num_proc=16): 0%| | 0/120 [00:00<?, ? examples/s] Converting format of dataset (num_proc=16): 0%| | 0/120 [00:00<?, ? 
examples/s] /usr/local/lib/python3.11/dist-packages/torch/distributed/distributed_c10d.py:4631: UserWarning: No device id is provided via init_process_group or barrier . Using the current device set by the user. warnings.warn( # warn only once [rank0]:[W703 16:31:05.679961201 ProcessGroupNCCL.cpp:4718] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device. [rank0]: multiprocess.pool.RemoteTraceback: [rank0]: """ [rank0]: Traceback (most recent call last): [rank0]: File "/home/wiseatc/.local/lib/python3.11/site-packages/multiprocess/pool.py", line 125, in worker [rank0]: result = (True, func(args, kwds)) [rank0]: ^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/wiseatc/.local/lib/python3.11/site-packages/datasets/utils/py_utils.py", line 688, in _write_generator_to_queue [rank0]: for i, result in enumerate(func(kwargs)): [rank0]: File "/home/wiseatc/.local/lib/python3.11/site-packages/datasets/arrow_dataset.py", line 3501, in _map_single [rank0]: for i, example in iter_outputs(shard_iterable): [rank0]: File "/home/wiseatc/.local/lib/python3.11/site-packages/datasets/arrow_dataset.py", line 3475, in iter_outputs [rank0]: yield i, apply_function(example, i, offset=offset) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/wiseatc/.local/lib/python3.11/site-packages/datasets/arrow_dataset.py", line 3398, in apply_function [rank0]: processed_inputs = function(fn_args, additional_args, fn_kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/wiseatc/LLaMA-Factory/src/llamafactory/data/converter.py", line 94, in call [rank0]: if self.dataset_attr.prompt and example[self.dataset_attr.prompt]: [rank0]: ~^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/wiseatc/.local/lib/python3.11/site-packages/datasets/formatting/formatting.py", 
line 278, in getitem [rank0]: value = self.data[key] [rank0]: ~~~^^^^^ [rank0]: KeyError: 'instruction' [rank0]: """ [rank0]: The above exception was the direct cause of the following exception: [rank0]: Traceback (most recent call last): [rank0]: File "/home/wiseatc/LLaMA-Factory/src/llamafactory/launcher.py", line 23, in <module> [rank0]: launch() [rank0]: File "/home/wiseatc/LLaMA-Factory/src/llamafactory/launcher.py", line 19, in launch [rank0]: run_exp() [rank0]: File "/home/wiseatc/LLaMA-Factory/src/llamafactory/train/tuner.py", line 110, in run_exp [rank0]: _training_function(config={"args": args, "callbacks": callbacks}) [rank0]: File "/home/wiseatc/LLaMA-Factory/src/llamafactory/train/tuner.py", line 72, in _training_function [rank0]: run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks) [rank0]: File "/home/wiseatc/LLaMA-Factory/src/llamafactory/train/sft/workflow.py", line 51, in run_sft [rank0]: dataset_module = get_dataset(template, model_args, data_args, training_args, stage="sft", tokenizer_module) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/wiseatc/LLaMA-Factory/src/llamafactory/data/loader.py", line 304, in get_dataset [rank0]: dataset = _get_merged_dataset(data_args.dataset, model_args, data_args, training_args, stage) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/wiseatc/LLaMA-Factory/src/llamafactory/data/loader.py", line 182, in _get_merged_dataset [rank0]: datasets[dataset_name] = _load_single_dataset(dataset_attr, model_args, data_args, training_args) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/wiseatc/LLaMA-Factory/src/llamafactory/data/loader.py", line 162, in _load_single_dataset [rank0]: return align_dataset(dataset, dataset_attr, data_args, training_args) [rank0]: 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/wiseatc/LLaMA-Factory/src/llamafactory/data/converter.py", line 279, in align_dataset [rank0]: return dataset.map( [rank0]: ^^^^^^^^^^^^ [rank0]: File "/home/wiseatc/.local/lib/python3.11/site-packages/datasets/arrow_dataset.py", line 557, in wrapper [rank0]: out: Union["Dataset", "DatasetDict"] = func(self, args, **kwargs) [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/wiseatc/.local/lib/python3.11/site-packages/datasets/arrow_dataset.py", line 3171, in map [rank0]: for rank, done, content in iflatmap_unordered( [rank0]: File "/home/wiseatc/.local/lib/python3.11/site-packages/datasets/utils/py_utils.py", line 728, in iflatmap_unordered [rank0]: [async_result.get(timeout=0.05) for async_result in async_results] [rank0]: File "/home/wiseatc/.local/lib/python3.11/site-packages/datasets/utils/py_utils.py", line 728, in [rank0]: [async_result.get(timeout=0.05) for async_result in async_results] [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [rank0]: File "/home/wiseatc/.local/lib/python3.11/site-packages/multiprocess/pool.py", line 774, in get [rank0]: raise self._value [rank0]: KeyError: 'instruction' [rank0]:[W703 16:31:06.912491219 ProcessGroupNCCL.cpp:1479] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. 
For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) W0703 16:31:07.960560 3914856 torch/distributed/elastic/multiprocessing/api.py:900] Sending process 3914916 closing signal SIGTERM W0703 16:31:07.961188 3914856 torch/distributed/elastic/multiprocessing/api.py:900] Sending process 3914917 closing signal SIGTERM W0703 16:31:07.961536 3914856 torch/distributed/elastic/multiprocessing/api.py:900] Sending process 3914918 closing signal SIGTERM E0703 16:31:08.371267 3914856 torch/distributed/elastic/multiprocessing/api.py:874] failed (exitcode: 1) local_rank: 0 (pid: 3914915) of binary: /usr/bin/python3.11 Traceback (most recent call last): File "/usr/local/bin/torchrun", line 8, in <module> sys.exit(main()) ^^^^^^ File "/usr/local/lib/python3.11/dist-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 355, in wrapper return f(*args, **kwargs) ^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.11/dist-packages/torch/distributed/run.py", line 892, in main run(args) File "/usr/local/lib/python3.11/dist-packages/torch/distributed/run.py", line 883, in run elastic_launch( File "/usr/local/lib/python3.11/dist-packages/torch/distributed/launcher/api.py", line 139, in call return launch_agent(self._config, self._entrypoint, list(args)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/usr/local/lib/python3.11/dist-packages/torch/distributed/launcher/api.py", line 270, in launch_agent raise ChildFailedError( torch.distributed.elastic.multiprocessing.errors.ChildFailedError: ============================================================ /home/wiseatc/LLaMA-Factory/src/llamafactory/launcher.py FAILED ------------------------------------------------------------ Failures: <NO_OTHER_FAILURES> ------------------------------------------------------------ Root Cause (first observed failure): [0]: time : 2025-07-03_16:31:07 host : wiseatc-Super-Server rank : 0 (local_rank: 
0) exitcode : 1 (pid: 3914915) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ============================================================ Traceback (most recent call last): File "/home/wiseatc/.local/bin/llamafactory-cli", line 8, in <module> sys.exit(main()) ^^^^^^ File "/home/wiseatc/LLaMA-Factory/src/llamafactory/cli.py", line 130, in main process = subprocess.run( ^^^^^^^^^^^^^^^ File "/usr/lib/python3.11/subprocess.py", line 569, in run raise CalledProcessError(retcode, process.args, subprocess.CalledProcessError: Command '['torchrun', '--nnodes', '1', '--node_rank', '0', '--nproc_per_node', '4', '--master_addr', '127.0.0.1', '--master_port', '41919', '/home/wiseatc/LLaMA-Factory/src/llamafactory/launcher.py', 'saves/DeepSeek-R1-1.5B-Distill/lora/train_2025-07-03-16-29-46/training_args.yaml']' returned non-zero exit status 1.

from llamafactory import get_template template = get_template("qwen2") # 与模型类型一致 print(template.prompt_template) 3. **预检脚本** 添加数据验证环节： python REQUIRED_FIELDS = [...

我已经下载了tiktoken和protobuf库，D:\PythonProject\deepseekai.venv\Scripts\python.exe D:\PythonProject\deepseekai\train_weather_model.py PyTorch 版本: 2.3.1+cu118 CUDA 可用: True GPU 名称: NVIDIA GeForce GTX 1650 Ti You are using the default legacy behaviour of the <class ‘transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast’>. This is expected, and simply means that the legacy (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set legacy=False. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message. Traceback (most recent call last): File “D:\PythonProject\deepseekai.venv\Lib\site-packages\transformers\convert_slow_tokenizer.py”, line 1737, in convert_slow_tokenizer ).converted() ^^^^^^^^^^^ File “D:\PythonProject\deepseekai.venv\Lib\site-packages\transformers\convert_slow_tokenizer.py”, line 1631, in converted tokenizer = self.tokenizer() ^^^^^^^^^^^^^^^^ File “D:\PythonProject\deepseekai.venv\Lib\site-packages\transformers\convert_slow_tokenizer.py”, line 1624, in tokenizer vocab_scores, merges = self.extract_vocab_merges_from_model(self.vocab_file) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File “D:\PythonProject\deepseekai.venv\Lib\site-packages\transformers\convert_slow_tokenizer.py”, line 1600, in extract_vocab_merges_from_model bpe_ranks = load_tiktoken_bpe(tiktoken_url) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File “D:\PythonProject\deepseekai.venv\Lib\site-packages\tiktoken\load.py”, line 148, in load_tiktoken_bpe contents = read_file_cached(tiktoken_bpe_file, expected_hash) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File “D:\PythonProject\deepseekai.venv\Lib\site-packages\tiktoken\load.py”, line 48, in read_file_cached cache_key = 
hashlib.sha1(blobpath.encode()).hexdigest() ^^^^^^^^^^^^^^^ AttributeError: ‘NoneType’ object has no attribute ‘encode’ During handling of the above exception, another exception occurred: Traceback (most recent call last): File “D:\PythonProject\deepseekai\train_weather_model.py”, line 31, in <module> tokenizer = AutoTokenizer.from_pretrained( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File “D:\PythonProject\deepseekai.venv\Lib\site-packages\transformers\models\auto\tokenization_auto.py”, line 1032, in from_pretrained return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File “D:\PythonProject\deepseekai.venv\Lib\site-packages\transformers\tokenization_utils_base.py”, line 2025, in from_pretrained return cls._from_pretrained( ^^^^^^^^^^^^^^^^^^^^^ File “D:\PythonProject\deepseekai.venv\Lib\site-packages\transformers\tokenization_utils_base.py”, line 2278, in _from_pretrained tokenizer = cls(*init_inputs, **init_kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File “D:\PythonProject\deepseekai.venv\Lib\site-packages\transformers\models\llama\tokenization_llama_fast.py”, line 154, in init super().init( File “D:\PythonProject\deepseekai.venv\Lib\site-packages\transformers\tokenization_utils_fast.py”, line 139, in init fast_tokenizer = convert_slow_tokenizer(self, from_tiktoken=True) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File “D:\PythonProject\deepseekai.venv\Lib\site-packages\transformers\convert_slow_tokenizer.py”, line 1739, in convert_slow_tokenizer raise ValueError( ValueError: Converting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: [‘AlbertTokenizer’, ‘BartTokenizer’, ‘BarthezTokenizer’, ‘BertTokenizer’, ‘BigBirdTokenizer’, ‘BlenderbotTokenizer’, ‘CamembertTokenizer’, ‘CLIPTokenizer’, 
‘CodeGenTokenizer’, ‘ConvBertTokenizer’, ‘DebertaTokenizer’, ‘DebertaV2Tokenizer’, ‘DistilBertTokenizer’, ‘DPRReaderTokenizer’, ‘DPRQuestionEncoderTokenizer’, ‘DPRContextEncoderTokenizer’, ‘ElectraTokenizer’, ‘FNetTokenizer’, ‘FunnelTokenizer’, ‘GPT2Tokenizer’, ‘HerbertTokenizer’, ‘LayoutLMTokenizer’, ‘LayoutLMv2Tokenizer’, ‘LayoutLMv3Tokenizer’, ‘LayoutXLMTokenizer’, ‘LongformerTokenizer’, ‘LEDTokenizer’, ‘LxmertTokenizer’, ‘MarkupLMTokenizer’, ‘MBartTokenizer’, ‘MBart50Tokenizer’, ‘MPNetTokenizer’, ‘MobileBertTokenizer’, ‘MvpTokenizer’, ‘NllbTokenizer’, ‘OpenAIGPTTokenizer’, ‘PegasusTokenizer’, ‘Qwen2Tokenizer’, ‘RealmTokenizer’, ‘ReformerTokenizer’, ‘RemBertTokenizer’, ‘RetriBertTokenizer’, ‘RobertaTokenizer’, ‘RoFormerTokenizer’, ‘SeamlessM4TTokenizer’, ‘SqueezeBertTokenizer’, ‘T5Tokenizer’, ‘UdopTokenizer’, ‘WhisperTokenizer’, ‘XLMRobertaTokenizer’, ‘XLNetTokenizer’, ‘SplinterTokenizer’, ‘XGLMTokenizer’, ‘LlamaTokenizer’, ‘CodeLlamaTokenizer’, ‘GemmaTokenizer’, ‘Phi3Tokenizer’] Process finished with exit code 1

from datasets import Dataset from transformers import ( AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig, LlamaTokenizer ) from peft import LoraConfig, get_...

D:\PythonProject\deepseekai\.venv\Scripts\python.exe D:\PythonProject\deepseekai\train_weather_model.py 模型文件已复制到: ./local-deepseek-model\model.safetensors 配置文件已创建: config.json 分词器配置文件已创建: tokenizer_config.json You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the legacy (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set legacy=False. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message Traceback (most recent call last): File "D:\PythonProject\deepseekai\train_weather_model.py", line 68, in <module> tokenizer = AutoTokenizer.from_pretrained( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "D:\PythonProject\deepseekai\.venv\Lib\site-packages\transformers\models\auto\tokenization_auto.py", line 1013, in from_pretrained return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "D:\PythonProject\deepseekai\.venv\Lib\site-packages\transformers\tokenization_utils_base.py", line 2025, in from_pretrained return cls._from_pretrained( ^^^^^^^^^^^^^^^^^^^^^ File "D:\PythonProject\deepseekai\.venv\Lib\site-packages\transformers\tokenization_utils_base.py", line 2063, in _from_pretrained slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "D:\PythonProject\deepseekai\.venv\Lib\site-packages\transformers\tokenization_utils_base.py", line 2278, in _from_pretrained tokenizer = cls(*init_inputs, **init_kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File 
"D:\PythonProject\deepseekai\.venv\Lib\site-packages\transformers\models\llama\tokenization_llama.py", line 171, in init self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "D:\PythonProject\deepseekai\.venv\Lib\site-packages\transformers\models\llama\tokenization_llama.py", line 198, in get_spm_processor tokenizer.Load(self.vocab_file) File "D:\PythonProject\deepseekai\.venv\Lib\site-packages\sentencepiece\init.py", line 961, in Load return self.LoadFromFile(model_file) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "D:\PythonProject\deepseekai\.venv\Lib\site-packages\sentencepiece\init.py", line 316, in LoadFromFile return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ TypeError: not a string Process finished with exit code 1

from datasets import Dataset import sentencepiece as spm # 1. 配置模型路径 ollama_blobs_dir = r"D:\OllamaModels\blobs" model_hash = "sha256-aabd4debf0c8f08881923f2c25fc0fdeed24435271c2b3e92c4af...

D:\PythonProject\deepseekai\.venv\Scripts\python.exe D:\PythonProject\deepseekai\train_weather_model.py sentencepiece 已安装: 0.2.0 创建/下载: tokenizer.model 下载失败: 401 Client Error: Unauthorized for url: https://huggingface.co/deepseek-ai/deepseek-llm-1.3b-base/resolve/main/tokenizer.model 无法下载 tokenizer.model, 请手动下载 Traceback (most recent call last): File "D:\PythonProject\deepseekai\train_weather_model.py", line 134, in <module> tokenizer = LlamaTokenizer.from_pretrained( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "D:\PythonProject\deepseekai\.venv\Lib\site-packages\transformers\tokenization_utils_base.py", line 2025, in from_pretrained return cls._from_pretrained( ^^^^^^^^^^^^^^^^^^^^^ File "D:\PythonProject\deepseekai\.venv\Lib\site-packages\transformers\tokenization_utils_base.py", line 2278, in _from_pretrained tokenizer = cls(*init_inputs, **init_kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "D:\PythonProject\deepseekai\.venv\Lib\site-packages\transformers\models\llama\tokenization_llama.py", line 171, in init self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "D:\PythonProject\deepseekai\.venv\Lib\site-packages\transformers\models\llama\tokenization_llama.py", line 198, in get_spm_processor tokenizer.Load(self.vocab_file) File "D:\PythonProject\deepseekai\.venv\Lib\site-packages\sentencepiece\init.py", line 961, in Load return self.LoadFromFile(model_file) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "D:\PythonProject\deepseekai\.venv\Lib\site-packages\sentencepiece\init.py", line 316, in LoadFromFile return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ TypeError: not a string Process finished with exit code 1

from datasets import Dataset from transformers import ( AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig ) from peft import LoraConfig, get_peft_model # 1. ...

torch与torchtext

from torchtext.data.utils import get_tokenizer from torchtext.vocab import build_vocab_from_iterator train_iter = AG_NEWS(split='train') tokenizer = get_tokenizer('basic_english') def yield_tokens...

AttributeError: module 'torchtext' has no attribute 'data'

from torchtext.data.utils import get_tokenizer from collections import Counter from torchtext.vocab import Vocab train_iter = AG_NEWS(split='train') counter = Counter() tokenizer = get_tokenizer('...

dataloader = data.BucketIterator(dataset, batch_size=32, shuffle=True) 这段代码中的dataset是什么形式？给出一个例子

from torchtext.data.utils import get_tokenizer from torchtext.vocab import build_vocab_from_iterator from torchtext.data import Field, LabelField # 先创建一个tokenizer和vocab对象 tokenizer = get_...

torchtext的SST2类的使用

from torchtext.data.utils import get_tokenizer from torchtext.vocab import GloVe 2. 下载和加载数据集 python train_data, test_data = SST2.splits( root='data', train='train.tsv', test='test....

torch加载imdb

from torchtext.data.utils import get_tokenizer from torchtext.vocab import build_vocab_from_iterator # 加载IMDb数据集 train_dataset, test_dataset = IMDB() # 定义分词器和词汇表 tokenizer = get_...

The following NEW packages will be INSTALLED: done # # To activate this environment, use # # $ conda activate torch_env # # To deactivate an active environment, use # # $ conda deactivate

from torchtext.legacy.data.utils import get_tokenizer from torchtext.legacy.vocab import GloVe, build_vocab_from_iterator # ... [其余代码保持原样，包括模型定义] ... # === 数据集加载修正 === def load...

Traceback (most recent call last): File "D:\pythonProject1\main.py", line 10, in <module> train_iter = WikiText2(root='./data', split='train') ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 怎么解决？

from torchtext.data.utils import get_tokenizer # 新版数据集加载（需指定split参数） train_iter, valid_iter, test_iter = WikiText2(split=('train', 'valid', 'test')) # 定义tokenizer（示例使用基础分词）...

用torch中的DataLoader划分文本数据集，并将数据集打印成txt文件

tokenizer = torchtext.data.utils.get_tokenizer("basic_english") # 分词器 train_data.fields["text"].build_vocab( train_data, max_size=MAX_VOCAB_SIZE, tokenizer=tokenizer ) # 将文本序列化为数字...

基于单片机的水位自动检测与控制系统开题报告.doc

相关推荐

python torch.utils.data.DataLoader使用方法

利用torch.utils.data.Dataset自定义数据加载类

PyTorch里面的torch.nn.Parameter()详解

构建PyTorch DataPipelines高手指南：复杂数据处理的终极解决方案

深度学习框架对决：TensorFlow vs. PyTorch vs. Keras

【4. 分类机制】多标签分类的挑战与策略：标签平衡与损失函数调整

torch与torchtext

AttributeError: module 'torchtext' has no attribute 'data'

dataloader = data.BucketIterator(dataset, batch_size=32, shuffle=True) 这段代码中的dataset是什么形式？给出一个例子

torchtext的SST2类的使用

torch加载imdb

The following NEW packages will be INSTALLED: done # # To activate this environment, use # # $ conda activate torch_env # # To deactivate an active environment, use # # $ conda deactivate

Traceback (most recent call last): File "D:\pythonProject1\main.py", line 10, in <module> train_iter = WikiText2(root='./data', split='train') ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 怎么解决？

用torch中的DataLoader划分文本数据集，并将数据集打印成txt文件

基于单片机的水位自动检测与控制系统开题报告.doc

大家在看

常用的网络拓朴图素材.zip

最新VISIO各种图形图标大集合.

2020_0610_应对新兴毫米波应用的测试挑战.pdf

国家/地区：国家/地区信息应用

Xilinx 7系列FPGA手册[打包下载]

最新推荐

基于单片机的水位自动检测与控制系统开题报告.doc

cc65 Windows完整版发布：6502 C开发工具

【CLIP模型实战】：从数据预处理到代码实现的图文相似度计算完全指南

车载以太网doip协议格式

JavaScript中文帮助手册：初学者实用指南

深入理解MySQL存储引擎：InnoDB与MyISAM的终极对决

window中系统中断，cpu占用100%

C++Builder6.0缺失帮助文件的解决方案

【湖北专升本MySQL强化训练】：5大SQL语句编写技巧，迅速提升实战能力

HFSS如何设置网格化细化