# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa, Albert, XLM-RoBERTa)."""
import argparse
import glob
import json
import logging
import os
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from models_config import RESULT_PATH, LOGITS_PATH
from utils.adv import FGM, PGD
from transformers import (
WEIGHTS_NAME,
AdamW,
AlbertConfig,
AlbertForSequenceClassification,
AlbertTokenizer,
BertConfig,
BertForSequenceClassification,
BertTokenizer,
DistilBertConfig,
DistilBertForSequenceClassification,
DistilBertTokenizer,
FlaubertConfig,
FlaubertForSequenceClassification,
FlaubertTokenizer,
RobertaConfig,
RobertaForSequenceClassification,
RobertaTokenizer,
XLMConfig,
XLMForSequenceClassification,
XLMRobertaConfig,
XLMRobertaForSequenceClassification,
XLMRobertaTokenizer,
XLMTokenizer,
XLNetConfig,
XLNetForSequenceClassification,
XLNetTokenizer,
get_linear_schedule_with_warmup,
)
from transformers import glue_compute_metrics as compute_metrics
from transformers import glue_convert_examples_to_features as convert_examples_to_features
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors
try:
from torch.utils.tensorboard import SummaryWriter
except ImportError:
from tensorboardX import SummaryWriter
logger = logging.getLogger(__name__)
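# Every pretrained shortcut name advertised by the supported config classes,
# flattened into a single tuple.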
ALL_MODELS = sum(
(
tuple(conf.pretrained_config_archive_map.keys())
for conf in (
BertConfig,
XLNetConfig,
XLMConfig,
RobertaConfig,
DistilBertConfig,
AlbertConfig,
XLMRobertaConfig,
FlaubertConfig,
)
),
(),
)
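# Map from model-type key (e.g. "bert", "roberta") to its
# (config class, model class, tokenizer class) triple.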
MODEL_CLASSES = {
"bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
"xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
"xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
"roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
"distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
"albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
"xlmroberta": (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer),
"flaubert": (FlaubertConfig, FlaubertForSequenceClassification, FlaubertTokenizer),
}
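# Illustrative lookup (a sketch; the actual model construction happens later in
# this script, outside this excerpt):
#   config_class, model_class, tokenizer_class = MODEL_CLASSES["bert"]
#   tokenizer = tokenizer_class.from_pretrained("bert-base-uncased")
#   model = model_class.from_pretrained("bert-base-uncased")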
def set_seed(args):
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
def train(args, train_dataset, model, tokenizer):
""" Train the model """
if args.local_rank in [-1, 0]:
tb_writer = SummaryWriter()
args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
if args.max_steps > 0:
t_total = args.max_steps
args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
else:
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
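    # t_total counts optimizer updates, not mini-batches: one update happens every
    # `gradient_accumulation_steps` batches, and the LR scheduler below is sized
    # against this number.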
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
"weight_decay": args.weight_decay,
},
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
)
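    # The learning rate rises linearly from 0 to args.learning_rate over the first
    # `warmup_steps` updates, then decays linearly back to 0 at step t_total.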
# Check if saved optimizer or scheduler states exist
if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
os.path.join(args.model_name_or_path, "scheduler.pt")
):
# Load in optimizer and scheduler states
optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
if args.fp16:
try:
from apex import amp
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
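        # apex's opt_level controls how aggressively mixed precision is applied,
        # e.g. "O1" (casts around whitelisted ops) or "O2" (almost everything in fp16).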
# multi-gpu training (should be after apex fp16 initialization)
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
# Distributed training (should be after apex fp16 initialization)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
)
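        # find_unused_parameters=True lets DDP tolerate parameters that receive no
        # gradient in a given forward pass, at a small synchronization cost.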
# Train!
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_dataset))
logger.info(" Num Epochs = %d", args.num_train_epochs)
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
logger.info(
" Total train batch size (w. parallel, distributed & accumulation) = %d",
args.train_batch_size
* args.gradient_accumulation_steps
* (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
)
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
logger.info(" Total optimization steps = %d", t_total)
global_step = 0
epochs_trained = 0
steps_trained_in_current_epoch = 0
# Check if continuing training from a checkpoint
if os.path.exists(args.model_name_or_path):
        # Set global_step to the global_step of the last saved checkpoint from the model path
        try:
            global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        except ValueError:
            global_step = 0
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
logger.info(" Continuing training from checkpoint, will skip to saved global_step")
logger.info(" Continuing training from epoch %d", epochs_trained)
logger.info(" Continuing training from global step %d", global_step)
logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
tr_loss, logging_loss = 0.0, 0.0
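    # Optional adversarial training: FGM and PGD come from the project-local
    # utils.adv module and presumably perturb the embedding layer during training
    # (FGM with a single gradient-based step, PGD with several projected steps).
    # Only one of the two helpers is instantiated.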
if args.adv_fgm:
fgm = FGM(model)
elif args.adv_pgd:
pgd = PGD(model)
    model.zero_grad()
