class NeuralDictionary(nn.Module):
    """Key/value memory bank that is read with soft attention.

    ``forward`` scores a query against every key, optionally re-weights the
    scores by the cosine similarity between ``context`` and the keys,
    optionally keeps only the ``top_k`` scores, and returns the
    attention-weighted sum of the values.  In training mode it also tracks
    how much attention each slot received and, every ``update_stride``
    calls that carry a context, blends context vectors into the
    least-used slots.

    Args:
        num_keys: Number of memory slots.
        d_model: Size of each key/value vector.
        update_stride: A memory update runs every ``update_stride`` calls
            to ``_update_memory``.
        update_size: Number of least-used slots refreshed per update
            (clamped to ``num_keys``).
        momentum: Blend factor of the new sample in a refreshed slot.
        top_k: If set and smaller than ``num_keys``, only the ``top_k``
            highest-scoring slots take part in the softmax.
    """

    def __init__(self, num_keys, d_model, update_stride=100, update_size=5,
                 momentum=0.2, top_k=None):
        super().__init__()
        self.num_keys = num_keys
        self.d_model = d_model
        self.update_stride = update_stride
        self.update_size = update_size
        self.momentum = momentum
        self.top_k = top_k

        # Memory bank (keys and values).
        self.keys = nn.Parameter(torch.randn(num_keys, d_model))
        self.values = nn.Parameter(torch.randn(num_keys, d_model) * 0.01)

        # Accumulated attention per slot and the number of update attempts.
        self.register_buffer('usage_counter', torch.zeros(num_keys))
        self.register_buffer('update_count', torch.tensor(0))

    def forward(self, query, context=None):
        """Read from memory.

        Args:
            query: Tensor whose last dimension is ``d_model``; the usage
                bookkeeping below sums over dim 0, so a 2-D
                ``(batch, d_model)`` input is assumed -- TODO confirm.
            context: Optional ``(batch, d_model)`` tensor.

        Returns:
            Attention-weighted combination of ``self.values``.
        """
        attn_scores = torch.matmul(query, self.keys.T)

        if context is not None:
            # Cosine similarity between each context row and each key:
            # (batch, num_keys).
            context_sim = F.cosine_similarity(
                context.unsqueeze(1), self.keys.unsqueeze(0), dim=2)
            # Multiplicative combination of the two scores.
            # NOTE(review): two negative factors yield a positive score --
            # confirm this is intended.
            attn_scores = attn_scores * context_sim

        if self.top_k is not None and self.top_k < self.num_keys:
            topk_scores, topk_indices = torch.topk(
                attn_scores, self.top_k, dim=-1)
            # Everything outside the top-k is set to -inf so that it
            # receives zero probability after the softmax.
            mask = torch.full_like(attn_scores, float('-inf'))
            mask.scatter_(-1, topk_indices, topk_scores)
            attn = F.softmax(mask, dim=-1)
        else:
            attn = F.softmax(attn_scores, dim=-1)

        memory_output = torch.matmul(attn, self.values)

        if self.training:
            with torch.no_grad():
                self.usage_counter += attn.sum(dim=0)
            if context is not None:
                self._update_memory(context)

        return memory_output

    def _update_memory(self, new_context):
        """Refresh the least-used slots every ``update_stride`` calls."""
        self.update_count += 1
        if self.update_count % self.update_stride != 0:
            return
        # Nothing to sample from: argmax over an empty batch would raise.
        if new_context.size(0) == 0:
            return

        with torch.no_grad():
            # torch.topk raises when k exceeds the number of elements.
            k = min(self.update_size, self.num_keys)
            _, replace_indices = torch.topk(
                self.usage_counter, k, largest=False)

            sample_idx = self._select_best_context(
                new_context, replace_indices)
            sampled_context = new_context[sample_idx]  # (k, d_model)

            keep = 1 - self.momentum
            # `.data` is used as in the original; NOTE(review): this write
            # happens after the keys/values were already used in the current
            # forward pass -- confirm the autograd implications are intended.
            self.keys.data[replace_indices] = (
                keep * self.keys.data[replace_indices]
                + self.momentum * sampled_context)
            # NOTE(review): the value slots receive the per-sample mean over
            # the feature dimension, broadcast to every feature -- confirm
            # this is the intended value update.
            self.values.data[replace_indices] = (
                keep * self.values.data[replace_indices]
                + self.momentum * sampled_context.mean(dim=1, keepdim=True))

            # The refreshed slots start counting from zero again.
            self.usage_counter[replace_indices] = 0

    def _select_best_context(self, new_context, replace_indices):
        """Return, for each slot to replace, the index of the most similar
        row of ``new_context`` (cosine similarity)."""
        cos_sim = F.cosine_similarity(
            new_context.unsqueeze(1),                 # (B, 1, d_model)
            self.keys[replace_indices].unsqueeze(0),  # (1, k, d_model)
            dim=2,
        )  # (B, k)
        return cos_sim.argmax(dim=0)


# Trailing question from the original text (not code):
# "对比上面的版本那个更优" -- compared with the version above, which is better?

class NeuralDictionary(nn.Module):
    """Key/value memory with temperature scaling, optional context
    re-weighting, optional top-k sparsification and an optional learned
    "meta attention" transform of the scores.

    Args:
        num_keys: Number of memory slots.
        d_model: Size of each key/value vector.
        update_stride, update_size, momentum: Stored for the memory-update
            logic, which is not part of the visible class.
        top_k: If set and smaller than ``num_keys``, only the ``top_k``
            highest raw scores take part in the softmax.
        temperature: Divisor applied to the raw attention scores.
        topk_context_size: Stored only; not used by the visible code.
        use_context_sim: Enable multiplication of the scores by the
            context/key cosine similarity.
        use_meta_attention: Enable the ``nn.Linear(num_keys, num_keys)``
            transform of the scores.
        value_update_ratio: Must be in ``[0, 1]``; stored only.
        usage_decay: Must be in ``[0, 1]``; stored only.
    """

    def __init__(self, num_keys, d_model, update_stride=100, update_size=5,
                 momentum=0.2, top_k=None, temperature=0.1,
                 topk_context_size=3, use_context_sim=True,
                 use_meta_attention=True, value_update_ratio=0.5,
                 usage_decay=0.9):
        super().__init__()

        # Parameter validation.
        assert 0 <= value_update_ratio <= 1, "value_update_ratio should be in [0,1]"
        assert 0 <= usage_decay <= 1, "usage_decay should be in [0,1]"

        # Configuration.
        self.num_keys = num_keys
        self.d_model = d_model
        self.update_stride = update_stride
        self.update_size = update_size
        self.momentum = momentum
        self.value_update_ratio = value_update_ratio
        self.top_k = top_k
        self.temperature = temperature
        self.topk_context_size = topk_context_size
        self.use_context_sim = use_context_sim
        self.use_meta_attention = use_meta_attention
        self.usage_decay = usage_decay

        # Keys and values are initialised independently.
        self.keys = nn.Parameter(torch.empty(num_keys, d_model))
        nn.init.kaiming_normal_(self.keys, mode='fan_in', nonlinearity='relu')
        self.values = nn.Parameter(torch.empty(num_keys, d_model))
        nn.init.xavier_normal_(self.values)

        # Dynamic state.
        self.register_buffer('usage_counter', torch.zeros(num_keys))
        self.register_buffer('update_count', torch.tensor(0))

        # Learned offset added to every query.
        self.query_bias = nn.Parameter(torch.zeros(d_model))

        # Single linear layer applied to the score vector.
        if use_meta_attention:
            self.meta_attention = nn.Linear(num_keys, num_keys)
        else:
            self.meta_attention = None

    def forward(self, query, context=None):
        """Read from memory.

        Args:
            query: ``(batch, d_model)`` tensor.
            context: Optional tensor with the same shape as ``query``.

        Returns:
            ``(batch, d_model)`` attention-weighted combination of values.
        """
        assert query.dim() == 2, "Input query must be 2D (batch_size, d_model)"

        query = query + self.query_bias  # (B, d)

        attn_scores = torch.einsum('bd,kd->bk', query, self.keys)  # (B, K)
        attn_scores = attn_scores / self.temperature

        if self.use_context_sim and context is not None:
            assert context.shape == query.shape, "Context must match query shape"
            context_sim = F.cosine_similarity(
                context.unsqueeze(1),    # (B, 1, d)
                self.keys.unsqueeze(0),  # (1, K, d)
                dim=-1,
            )  # (B, K)
            attn_scores = attn_scores * context_sim

        # Decide which slots survive top-k on the raw scores ...
        keep = None
        if self.top_k is not None and self.top_k < self.num_keys:
            _, topk_indices = torch.topk(attn_scores, self.top_k, dim=-1)
            keep = torch.zeros_like(attn_scores, dtype=torch.bool)
            keep.scatter_(-1, topk_indices, True)

        # ... but apply the -inf mask only AFTER the linear transform.
        # Feeding -inf into nn.Linear mixes +inf and -inf terms and yields
        # NaN logits, so the mask must not pass through the layer.
        if self.use_meta_attention:
            logits = self.meta_attention(attn_scores)
        else:
            logits = attn_scores
        if keep is not None:
            logits = logits.masked_fill(~keep, float('-inf'))
        attn = F.softmax(logits, dim=-1)

        memory_output = torch.einsum('bk,kd->bd', attn, self.values)  # (B, d)

        if self.training:
            # NOTE(review): `_update_usage` and `_update_memory` are not
            # defined in the visible part of this class; training-mode calls
            # will raise AttributeError unless they are provided elsewhere.
            self._update_usage(attn)
            if context is not None:
                self._update_memory(context)

        return memory_output

参数包括num_keys（键的数量）、d_model（模型的维度）、update_stride（更新步长）、update_size（更新大小）、momentum（动量）等等。还有一些新增的参数，比如value_update_ratio和usage_decay，需要特别注意它们...

class NeuralDictionary(nn.Module):
    """Differentiable neural dictionary (optimised variant aimed at better
    stability and practicality).

    ``forward`` attends over ``keys`` with a softmax of dot products and
    returns the weighted sum of ``values``.  In training mode it tracks the
    attention mass per slot and periodically blends context vectors into
    the least-used slots.

    Args:
        num_keys: Number of memory slots.
        d_model: Size of each key/value vector.
        update_stride: A memory update runs every ``update_stride`` calls
            to ``_update_memory``.
        update_size: Number of least-used slots refreshed per update
            (clamped to ``num_keys``).
        momentum: Blend factor of the new sample in a refreshed slot.
    """

    def __init__(self, num_keys, d_model, update_stride=100, update_size=5,
                 momentum=0.2):
        super().__init__()
        self.num_keys = num_keys
        self.d_model = d_model
        self.update_stride = update_stride
        self.update_size = update_size
        self.momentum = momentum

        # Memory bank.
        self.keys = nn.Parameter(torch.randn(num_keys, d_model))
        self.values = nn.Parameter(torch.randn(num_keys, d_model) * 0.01)

        # Usage statistics and update counter.
        self.register_buffer('usage_counter', torch.zeros(num_keys))
        self.register_buffer('update_count', torch.tensor(0))

    def forward(self, query, context=None):
        """Return the memory read-out for ``query``.

        ``context`` (optional) is only used in training mode, as the pool
        of candidate vectors for refreshing memory slots.
        """
        # 1. Memory retrieval.
        attn = F.softmax(torch.matmul(query, self.keys.T), dim=-1)  # (B, K)
        memory_output = torch.matmul(attn, self.values)  # (B, d_model)

        if self.training:
            # 2. Usage statistics are only collected while training.
            with torch.no_grad():
                self.usage_counter += attn.sum(dim=0)
            # 3. Dynamic memory update.
            if context is not None:
                self._update_memory(context)

        return memory_output

    def _update_memory(self, new_context):
        """Refresh the least-used slots every ``update_stride`` calls."""
        self.update_count += 1
        if self.update_count % self.update_stride != 0:
            return
        # argmax over an empty batch would raise; nothing to sample from.
        if new_context.size(0) == 0:
            return

        with torch.no_grad():
            # torch.topk raises when k exceeds the number of elements.
            k = min(self.update_size, self.num_keys)
            _, replace_indices = torch.topk(
                self.usage_counter, k, largest=False)

            # For each slot to replace, pick the context row most similar
            # to the slot's current key.
            sample_idx = self._select_best_context(
                new_context, replace_indices)
            sampled_context = new_context[sample_idx]  # (k, d_model)

            keep = 1 - self.momentum
            self.keys.data[replace_indices] = (
                keep * self.keys.data[replace_indices]
                + self.momentum * sampled_context)
            # NOTE(review): values are blended with the per-sample mean over
            # the feature dimension (broadcast) -- confirm this is intended.
            self.values.data[replace_indices] = (
                keep * self.values.data[replace_indices]
                + self.momentum * sampled_context.mean(dim=1, keepdim=True))

            # Reset the usage of the refreshed slots.
            self.usage_counter[replace_indices] = 0

    def _select_best_context(self, new_context, replace_indices):
        """Return, per slot in ``replace_indices``, the index of the row of
        ``new_context`` with the highest cosine similarity to that key."""
        cos_sim = F.cosine_similarity(
            new_context.unsqueeze(1),                 # (B, 1, d_model)
            self.keys[replace_indices].unsqueeze(0),  # (1, k, d_model)
            dim=2,
        )  # (B, k)
        return cos_sim.argmax(dim=0)


# Trailing request from the original text (not code):
# "学习这个版本的代码" -- study this version of the code.

def __init__(self, num_keys, d_model, update_stride=100, update_size=5, momentum=0.2): - num_keys：记忆容量（最大存储条目数） - d_model：记忆向量的维度 - update_stride=100：每100次前向传播...

class KeyWordSpotter(torch.nn.Module):
    """Streaming keyword spotter driven by CTC prefix beam search.

    Call flow for one audio chunk (``forward``):

    1. ``accept_wave``   -- raw PCM bytes -> fbank features (plus optional
       context splicing and frame skipping), buffering leftovers.
    2. model inference   -- features + cache -> logits -> softmax.
    3. ``decode_keywords`` -- per frame, extend the beam hypotheses.
    4. ``execute_detection`` -- per frame, look for a keyword token
       sequence in the hypotheses and apply score / duration / interval
       checks; fills ``self.result``.

    ``set_keywords`` must be called before ``forward`` so that
    ``keywords_idxset`` and ``keywords_token`` exist.
    """

    def __init__(
        self,
        ckpt_path,
        config_path,
        token_path,
        lexicon_path,
        threshold,
        min_frames=5,
        max_frames=250,
        interval_frames=50,
        score_beam=3,
        path_beam=20,
        gpu=-1,
        is_jit_model=False,
    ):
        super().__init__()
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)
        with open(config_path, 'r') as fin:
            # NOTE(review): FullLoader can construct Python objects; only
            # load trusted config files (yaml.safe_load would be stricter).
            configs = yaml.load(fin, Loader=yaml.FullLoader)
        dataset_conf = configs['dataset_conf']
        feat_conf = dataset_conf['feature_extraction_conf']

        # feature related
        self.sample_rate = 16000
        self.wave_remained = np.array([])
        self.num_mel_bins = feat_conf['num_mel_bins']
        self.frame_length = feat_conf['frame_length']  # in ms
        self.frame_shift = feat_conf['frame_shift']  # in ms
        self.downsampling = dataset_conf.get('frame_skip', 1)
        self.resolution = self.frame_shift / 1000  # in second

        # fsmn splice operation
        self.context_expansion = dataset_conf.get('context_expansion', False)
        self.left_context = 0
        self.right_context = 0
        if self.context_expansion:
            self.left_context = dataset_conf['context_expansion_conf']['left']
            self.right_context = dataset_conf['context_expansion_conf']['right']
        self.feature_remained = None
        self.feats_ctx_offset = 0  # after downsample, offset exist.

        # model related
        if is_jit_model:
            model = torch.jit.load(ckpt_path)
            # For script model, only cpu is supported.
            device = torch.device('cpu')
        else:
            # Init model from configs
            model = init_model(configs['model'])
            load_checkpoint(model, ckpt_path)
            use_cuda = gpu >= 0 and torch.cuda.is_available()
            device = torch.device('cuda' if use_cuda else 'cpu')
        self.device = device
        self.model = model.to(device)
        self.model.eval()
        logging.info(f'model {ckpt_path} loaded.')

        self.token_table = read_token(token_path)
        logging.info(f'tokens {token_path} with '
                     f'{len(self.token_table)} units loaded.')
        self.lexicon_table = read_lexicon(lexicon_path)
        logging.info(f'lexicons {lexicon_path} with '
                     f'{len(self.lexicon_table)} units loaded.')
        self.in_cache = torch.zeros(0, 0, 0, dtype=torch.float)

        # decoding and detection related
        self.score_beam = score_beam
        self.path_beam = path_beam

        self.threshold = threshold
        self.min_frames = min_frames
        self.max_frames = max_frames
        self.interval_frames = interval_frames

        self.cur_hyps = [(tuple(), (1.0, 0.0, []))]
        self.hit_score = 1.0
        self.hit_keyword = None
        self.activated = False

        self.total_frames = 0  # frame offset, for absolute time
        self.last_active_pos = -1  # the last frame of being activated
        self.result = {}

    def set_keywords(self, keywords):
        """Parse a comma-separated keyword string into token-id sequences.

        Stores ``self.keywords_token`` (keyword -> token ids / string) and
        ``self.keywords_idxset`` (all token ids in use, including 0).
        """
        assert keywords is not None, \
            'at least one keyword is needed, ' \
            'multiple keywords should be splitted with comma(,)'
        keywords_list = keywords.strip().replace(' ', '').split(',')

        keywords_token = {}
        keywords_idxset = {0}
        keywords_strset = {'<blk>'}
        keywords_tokenmap = {'<blk>': 0}
        for keyword in keywords_list:
            strs, indexes = query_token_set(keyword, self.token_table,
                                            self.lexicon_table)
            keywords_token[keyword] = {
                'token_id': indexes,
                'token_str': ''.join('%s ' % str(i) for i in indexes),
            }
            keywords_strset.update(strs)
            keywords_idxset.update(indexes)
            for txt, idx in zip(strs, indexes):
                if keywords_tokenmap.get(txt, None) is None:
                    keywords_tokenmap[txt] = idx

        token_print = ''.join(
            f'{txt}({idx}) ' for txt, idx in keywords_tokenmap.items())
        logging.info(f'Token set is: {token_print}')
        self.keywords_idxset = keywords_idxset
        self.keywords_token = keywords_token

    def accept_wave(self, wave):
        """Turn a chunk of 16-bit little-endian PCM bytes into features.

        Returns ``None`` while too little audio has been buffered,
        otherwise the feature tensor for this chunk.
        """
        assert isinstance(wave, bytes), \
            "please make sure the input format is bytes(raw PCM)"
        # Decode little-endian int16 samples in one pass.  The samples are
        # deliberately NOT divided by 32768.0, because kaldi.fbank accepts
        # the original integer range.
        samples = np.frombuffer(wave, dtype='<i2')
        wave = np.append(self.wave_remained, samples)
        if wave.size < (self.frame_length * self.sample_rate / 1000) \
                * self.right_context:
            self.wave_remained = wave
            return None
        wave_tensor = torch.from_numpy(wave).float().to(self.device)
        wave_tensor = wave_tensor.unsqueeze(0)  # add a channel dimension
        feats = kaldi.fbank(wave_tensor,
                            num_mel_bins=self.num_mel_bins,
                            frame_length=self.frame_length,
                            frame_shift=self.frame_shift,
                            dither=0,
                            energy_floor=0.0,
                            sample_frequency=self.sample_rate)
        # update wave remained
        feat_len = len(feats)
        frame_shift = int(self.frame_shift / 1000 * self.sample_rate)
        self.wave_remained = wave[feat_len * frame_shift:]

        if self.context_expansion:
            assert feat_len > self.right_context, \
                "make sure each chunk feat length is large than right context."
            # pad feats with remained feature from last chunk
            if self.feature_remained is None:  # first chunk
                # pad first frame at the beginning,
                # replicate just support last dimension, so we do transpose.
                feats_pad = F.pad(feats.T, (self.left_context, 0),
                                  mode='replicate').T
            else:
                feats_pad = torch.cat((self.feature_remained, feats))

            # NOTE(review): right_context is subtracted twice here, as in
            # the original -- confirm against the upstream implementation.
            ctx_frm = feats_pad.shape[0] - (self.right_context +
                                            self.right_context)
            ctx_win = (self.left_context + self.right_context + 1)
            ctx_dim = feats.shape[1] * ctx_win
            feats_ctx = torch.zeros(ctx_frm, ctx_dim, dtype=torch.float32)
            for i in range(ctx_frm):
                feats_ctx[i] = torch.cat(tuple(
                    feats_pad[i:i + ctx_win])).unsqueeze(0)

            # update feature remained, and feats
            self.feature_remained = \
                feats[-(self.left_context + self.right_context):]
            feats = feats_ctx.to(self.device)

        if self.downsampling > 1:
            last_remainder = 0 if self.feats_ctx_offset == 0 \
                else self.downsampling - self.feats_ctx_offset
            remainder = (feats.size(0) + last_remainder) % self.downsampling
            feats = feats[self.feats_ctx_offset::self.downsampling, :]
            self.feats_ctx_offset = remainder \
                if remainder == 0 else self.downsampling - remainder
        return feats

    def decode_keywords(self, t, probs):
        """Advance the beam by one frame of posterior ``probs``."""
        absolute_time = t + self.total_frames
        # search next_hyps depend on current probs and hyps.
        next_hyps = ctc_prefix_beam_search(absolute_time, probs,
                                           self.cur_hyps,
                                           self.keywords_idxset,
                                           self.score_beam)
        # update cur_hyps. note: the hyps is sort by path score(pnb+pb),
        # not the keywords' probabilities.
        self.cur_hyps = next_hyps[:self.path_beam]

    def execute_detection(self, t):
        """Search the current hypotheses for a keyword and fill
        ``self.result`` according to the threshold / duration / interval
        checks."""
        absolute_time = t + self.total_frames
        hit_keyword = None
        start = 0
        end = 0

        # hyps for detection
        hyps = [(y[0], y[1][0] + y[1][1], y[1][2]) for y in self.cur_hyps]

        # detect keywords in decoding paths.
        for one_hyp in hyps:
            prefix_ids = one_hyp[0]
            prefix_nodes = one_hyp[2]
            assert len(prefix_ids) == len(prefix_nodes)
            for word in self.keywords_token.keys():
                lab = self.keywords_token[word]['token_id']
                offset = is_sublist(prefix_ids, lab)
                if offset != -1:
                    hit_keyword = word
                    start = prefix_nodes[offset]['frame']
                    end = prefix_nodes[offset + len(lab) - 1]['frame']
                    # Accumulate the product of the token probabilities
                    # (hit_score starts at 1.0 after reset()).
                    for idx in range(offset, offset + len(lab)):
                        self.hit_score *= prefix_nodes[idx]['prob']
                    break
            if hit_keyword is not None:
                self.hit_score = math.sqrt(self.hit_score)
                break

        duration = end - start
        if hit_keyword is not None:
            if self.hit_score >= self.threshold and \
                    self.min_frames <= duration <= self.max_frames \
                    and (self.last_active_pos == -1 or
                         end - self.last_active_pos >= self.interval_frames):
                self.activated = True
                self.last_active_pos = end
                logging.info(
                    f"Frame {absolute_time} detect {hit_keyword} "
                    f"from {start} to {end} frame. "
                    f"duration {duration}, score {self.hit_score}, Activated.")

            elif self.last_active_pos > 0 and \
                    end - self.last_active_pos < self.interval_frames:
                logging.info(
                    f"Frame {absolute_time} detect {hit_keyword} "
                    f"from {start} to {end} frame. "
                    f"but interval {end-self.last_active_pos} "
                    f"is lower than {self.interval_frames}, Deactivated. ")

            elif self.hit_score < self.threshold:
                logging.info(f"Frame {absolute_time} detect {hit_keyword} "
                             f"from {start} to {end} frame. "
                             f"but {self.hit_score} "
                             f"is lower than {self.threshold}, Deactivated. ")

            elif self.min_frames > duration or duration > self.max_frames:
                logging.info(
                    f"Frame {absolute_time} detect {hit_keyword} "
                    f"from {start} to {end} frame. "
                    f"but {duration} beyond range"
                    f"({self.min_frames}~{self.max_frames}), Deactivated. ")

        self.result = {
            "state": 1 if self.activated else 0,
            "keyword": hit_keyword if self.activated else None,
            "start": start * self.resolution if self.activated else None,
            "end": end * self.resolution if self.activated else None,
            "score": self.hit_score if self.activated else None
        }

    def forward(self, wave_chunk):
        """Process one chunk of PCM bytes and return the detection result
        dict (empty dict while not enough audio is available)."""
        feature = self.accept_wave(wave_chunk)
        if feature is None or feature.size(0) < 1:
            return {}  # the feature is not enough to get result.
        feature = feature.unsqueeze(0)  # add a batch dimension
        logits, self.in_cache = self.model(feature, self.in_cache)
        probs = logits.softmax(2)  # (batch_size, maxlen, vocab_size)
        probs = probs[0].cpu()  # remove batch dimension
        for (t, prob) in enumerate(probs):
            # Map the model frame index back to the input frame rate.
            t *= self.downsampling
            self.decode_keywords(t, prob)
            self.execute_detection(t)

            if self.activated:
                self.reset()
                # since a chunk include about 30 frames,
                # once activated, we can jump the latter frames.
                # TODO: there should give another method to update result,
                #  avoiding self.result being cleared.
                break

        # update frame offset
        self.total_frames += len(probs) * self.downsampling

        # For streaming kws, the cur_hyps should be reset if the time of
        # a possible keyword last over the max_frames value you set.
        # see this issue:https://github.com/duj12/kws_demo/issues/2
        if len(self.cur_hyps) > 0 and len(self.cur_hyps[0][0]) > 0:
            keyword_may_start = int(self.cur_hyps[0][1][2][0]['frame'])
            if (self.total_frames - keyword_may_start) > self.max_frames:
                self.reset()
        return self.result

    def reset(self):
        """Clear the decoding state (hypotheses, score, activation flag)."""
        self.cur_hyps = [(tuple(), (1.0, 0.0, []))]
        self.activated = False
        self.hit_score = 1.0

    def reset_all(self):
        """Clear decoding state plus all audio/feature/model caches."""
        self.reset()
        self.wave_remained = np.array([])
        self.feature_remained = None
        self.feats_ctx_offset = 0  # after downsample, offset exist.
        self.in_cache = torch.zeros(0, 0, 0, dtype=torch.float)
        self.total_frames = 0  # frame offset, for absolute time
        self.last_active_pos = -1  # the last frame of being activated
        self.result = {}


# Trailing request from the original text (not code):
# "请帮我缕清整个脉络" -- please help me sort out the overall flow.

def __init__(self, num_classes=2, input_dim=40, hidden_size=128, num_layers=2): super(KeyWordSpotter, self).__init__() # 卷积部分：用于提取局部特征 self.conv1 = nn.Conv1d(input_dim, 64, kernel_...

# This is the code of the main.py file:
import argparse
import json
import math
import os
from datetime import datetime
from functools import partial

import cv2
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from torch.utils.data import DataLoader, Dataset
from torch.version import cuda
from torchvision import transforms
from torchvision.datasets import CIFAR10
from torchvision.models import resnet
from tqdm import tqdm

# Command-line configuration for MoCo training on CIFAR-10.
parser = argparse.ArgumentParser(description='Train MoCo on CIFAR-10')

parser.add_argument('-a', '--arch', default='resnet18')

# lr: 0.06 for batch 512 (or 0.03 for batch 256)
parser.add_argument('--lr', '--learning-rate', default=0.06, type=float, metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--epochs', default=300, type=int, metavar='N', help='number of total epochs to run')
# nargs='*' lets --schedule take zero or more integers; an empty string is
# not a valid nargs value and makes add_argument raise.
parser.add_argument('--schedule', default=[120, 160], nargs='*', type=int, help='learning rate schedule (when to drop lr by 10x); does not take effect if --cos is on')
parser.add_argument('--cos', action='store_true', help='use cosine lr schedule')

parser.add_argument('--batch-size', default=64, type=int, metavar='N', help='mini-batch size')
parser.add_argument('--wd', default=5e-4, type=float, metavar='W', help='weight decay')

# moco specific configs:
parser.add_argument('--moco-dim', default=128, type=int, help='feature dimension')
parser.add_argument('--moco-k', default=4096, type=int, help='queue size; number of negative keys')
parser.add_argument('--moco-m', default=0.99, type=float, help='moco momentum of updating key encoder')
parser.add_argument('--moco-t', default=0.1, type=float, help='softmax temperature')

parser.add_argument('--bn-splits', default=8, type=int, help='simulate multi-gpu behavior of BatchNorm in one gpu; 1 is SyncBatchNorm in multi-gpu')
parser.add_argument('--symmetric', action='store_true', help='use a symmetric loss function that backprops to both crops')

# knn monitor
parser.add_argument('--knn-k', default=20, type=int, help='k in kNN monitor')
parser.add_argument('--knn-t', default=0.1, type=float, help='softmax temperature in kNN monitor; could be different with moco-t')

# utils
parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)')
parser.add_argument('--results-dir', default='', type=str, metavar='PATH', help='path to cache (default: none)')

# When running from the command line use instead:
#     args = parser.parse_args()
args = parser.parse_args('')  # running in ipynb

# set command line arguments here when running in ipynb
args.epochs = 300  # modified here
args.cos = True
args.schedule = []  # cos in use
args.symmetric = False
if args.results_dir == '':
    args.results_dir = "E:\\contrast\\yolov8\\MoCo\\run\\cache-" + datetime.now().strftime("%Y-%m-%d-%H-%M-%S-moco")

moco_args = args


class CIFAR10Pair(CIFAR10):
    """CIFAR10 variant that returns four augmented views per image."""

    def __getitem__(self, index):
        """Return ``(im_1, im_2, im_3, im_4)`` for sample ``index``.

        ``im_1``/``im_2`` are two independent transforms of the raw image;
        ``im_3`` is the transform of the first degraded+augmented image and
        ``im_4`` the transform of the CutMix image, both produced by
        ``image_degradation_and_augmentation``.  No label is returned.
        """
        img = Image.fromarray(self.data[index])

        # Views of the original image.
        im_1 = self.transform(img)
        im_2 = self.transform(img)

        # Extra views generated from the degradation pipeline.
        degraded_results = image_degradation_and_augmentation(img)
        # First entry of the augmented (degraded) set.
        im_3 = self.transform(Image.fromarray(degraded_results['augmented_images'][0]))
        im_4 = self.transform(Image.fromarray(degraded_results['cutmix_image']))

        return im_1, im_2, im_3, im_4


# Define the data loader
# class CIFAR10Pair(CIFAR10):
#     """CIFAR10 Dataset.
#     """
#     def __getitem__(self, index):
#         img = self.data[index]
#         img = Image.fromarray(img)
#         if self.transform is not None:
#             im_1 = self.transform(img)
#             im_2 = self.transform(img)
#         return im_1, im_2

import cv2
import numpy as np
import random


def apply_interpolation_degradation(img, method):
    """Degrade an image by 2x down-sampling followed by up-sampling.

    Args:
        img: Input image (numpy array).
        method: Interpolation used for both steps: 'nearest', 'bilinear'
            or 'bicubic'.  Any other value returns ``img`` unchanged.

    Returns:
        The degraded image, same height/width as the input.
    """
    flags = {
        'nearest': cv2.INTER_NEAREST,
        'bilinear': cv2.INTER_LINEAR,
        'bicubic': cv2.INTER_CUBIC,
    }
    flag = flags.get(method)
    if flag is None:
        return img

    h, w = img.shape[:2]
    # Never request a zero-sized target (1-pixel-wide/high inputs).
    small_size = (max(1, w // 2), max(1, h // 2))
    downsampled = cv2.resize(img, small_size, interpolation=flag)
    return cv2.resize(downsampled, (w, h), interpolation=flag)


def darken_image(img, intensity=0.3):
    """Darken an image: scale the HSV value channel, then gamma-correct it.

    Args:
        img: Input image (numpy array); converted with COLOR_RGB2HSV, so
            an RGB uint8 image is assumed -- TODO confirm.
        intensity: Brightness scale factor, clamped to [0.1, 0.9].

    Returns:
        The darkened image converted back with COLOR_HSV2RGB.
    """
    # Clamp the intensity range.
    intensity = max(0.1, min(0.9, intensity))

    hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV).astype(np.float32)

    # Lower the brightness (V channel).
    hsv[:, :, 2] = hsv[:, :, 2] * intensity

    # Gamma correction on the V channel; gamma = 1 + (1 - intensity).
    gamma = 1.0 + (1.0 - intensity)
    hsv[:, :, 2] = np.power(hsv[:, :, 2] / 255.0, gamma) * 255.0

    # Keep the channel inside 0-255.
    hsv[:, :, 2] = np.clip(hsv[:, :, 2], 0, 255)

    return cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2RGB)


def random_affine(image):
    """Apply a random scale (0.8-1.2) and translation (up to 10% of the
    shorter side) to an image.

    Args:
        image: Input image (numpy array).

    Returns:
        The transformed image, same size as the input.
    """
    height, width = image.shape[:2]

    scale = random.uniform(0.8, 1.2)

    max_trans = int(0.1 * min(width, height))
    tx = random.randint(-max_trans, max_trans)
    ty = random.randint(-max_trans, max_trans)

    # 2x3 affine matrix: uniform scale plus translation.
    M = np.array([[scale, 0, tx], [0, scale, ty]], dtype=np.float32)

    return cv2.warpAffine(image, M, (width, height))


def augment_hsv(image, h_gain=0.1, s_gain=0.5, v_gain=0.5):
    """Randomly jitter hue, saturation and value in HSV space.

    Args:
        image: Input image (numpy array); converted with COLOR_RGB2HSV.
        h_gain, s_gain, v_gain: Half-widths of the random gain ranges for
            the three channels.

    Returns:
        The augmented image converted back with COLOR_HSV2RGB.
    """
    # Draw the gains, then clamp them to fixed limits.
    h_gain = max(-0.1, min(0.1, random.uniform(-h_gain, h_gain)))
    s_gain = max(0.5, min(1.5, random.uniform(1 - s_gain, 1 + s_gain)))
    v_gain = max(0.5, min(1.5, random.uniform(1 - v_gain, 1 + v_gain)))

    hsv = cv2.cvtColor(image, cv2.COLOR_RGB2HSV).astype(np.float32)

    # Hue wraps modulo 180; saturation and value are clipped to 0-255.
    hsv[:, :, 0] = (hsv[:, :, 0] * (1 + h_gain)) % 180
    hsv[:, :, 1] = np.clip(hsv[:, :, 1] * s_gain, 0, 255)
    hsv[:, :, 2] = np.clip(hsv[:, :, 2] * v_gain, 0, 255)

    return cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2RGB)


# def mixup(img1, img2, alpha=0.6):
#     """
#     Blend two images together.
#     Args:
#         img1, img2: input images (numpy arrays)
#         alpha: parameter of the Beta distribution controlling the mix ratio
#     Returns:
#         the blended image
#     """
#     # draw the mix ratio
#     lam = random.betavariate(alpha, alpha)
#     # make sure both images have the same size
#     if img1.shape != img2.shape:
#         img2 = cv2.resize(img2, (img1.shape[1], img1.shape[0]))
#     # blend the images
#     mixed = (lam * img1.astype(np.float32) + (1 - lam) * img2.astype(np.float32)).astype(np.uint8)
#     return mixed

# def image_degradation_and_augmentation(image,dark_intensity=0.3):
#     """
#     Complete image degradation and augmentation pipeline.
#     Args:
#         image: input image (PIL.Image or numpy array)
#     Returns:
#         dict: all degradation groups and the final augmentation results
#     """
#     # make sure the input is a numpy array
#     if not isinstance(image, np.ndarray):
#         image = np.array(image)
#     # make sure the image is RGB
#     if len(image.shape) == 2:
#         image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
#     elif image.shape[2] == 4:
#         image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
#     # original image
#     original = image.copy()
#     # list of interpolation methods
#     interpolation_methods = ['nearest', 'bilinear', 'bicubic']
#     # first degradation group: the three interpolation methods
#     group1 = []
#     for method in interpolation_methods:
#         degraded = apply_interpolation_degradation(original, method)
#         group1.append(degraded)
#     # second degradation group: random extra degradation
#     group2 = []
#     for img in group1:
#         # pick a degradation method at random
#         method = random.choice(interpolation_methods)
#         extra_degraded
= apply_interpolation_degradation(img, method) # group2.append(extra_degraded) # # 所有退化图像组合 # all_degraded_images = [original] + group1 + group2 # # 应用黑暗处理 (在增强之前) # darkened_images = [darken_image(img, intensity=dark_intensity) for img in all_degraded_images] # # 应用数据增强 # # 1. 随机仿射变换 # affine_images = [random_affine(img) for img in darkened_images] # # 2. HSV增强 # hsv_images = [augment_hsv(img) for img in affine_images] # # 3. MixUp增强 # # 随机选择两个增强后的图像进行混合 # mixed_image = mixup( # random.choice(hsv_images), # random.choice(hsv_images) # ) # # 返回结果 # results = { # 'original': original, # 'degraded_group1': group1, # 第一组退化图像 # 'degraded_group2': group2, # 第二组退化图像 # 'augmented_images': hsv_images, # 所有增强后的图像（原始+六组退化） # 'mixup_image': mixed_image # MixUp混合图像 # } # return results # # def add_gaussian_noise(image, mean=0, sigma=25): # # """添加高斯噪声""" # # noise = np.random.normal(mean, sigma, image.shape) # # noisy = np.clip(image + noise, 0, 255).astype(np.uint8) # # return noisy # # def random_cutout(image, max_holes=3, max_height=16, max_width=16): # # """随机CutOut增强""" # # h, w = image.shape[:2] # # for _ in range(random.randint(1, max_holes)): # # hole_h = random.randint(1, max_height) # # hole_w = random.randint(1, max_width) # # y = random.randint(0, h - hole_h) # # x = random.randint(0, w - hole_w) # # image[y:y+hole_h, x:x+hole_w] = 0 # # return image import cv2 import numpy as np import random from matplotlib import pyplot as plt import pywt def wavelet_degradation(image, level=0.5): """小波系数衰减退化""" # 小波分解 coeffs = pywt.dwt2(image, 'haar') cA, (cH, cV, cD) = coeffs # 衰减高频系数 cH = cH * level cV = cV * level cD = cD * level # 重建图像 return pywt.idwt2((cA, (cH, cV, cD)), 'haar')[:image.shape[0], :image.shape[1]] def adaptive_interpolation_degradation(image): """自适应插值退化（随机选择最近邻或双三次插值）""" if random.choice([True, False]): method = cv2.INTER_NEAREST # 最近邻插值 else: method = cv2.INTER_CUBIC # 双三次插值 # 先缩小再放大 scale_factor = random.uniform(0.3, 0.8) small = cv2.resize(image, None, 
fx=scale_factor, fy=scale_factor, interpolation=method) return cv2.resize(small, (image.shape[1], image.shape[0]), interpolation=method) def bilinear_degradation(image): """双线性插值退化""" # 先缩小再放大 scale_factor = random.uniform(0.3, 0.8) small = cv2.resize(image, None, fx=scale_factor, fy=scale_factor, interpolation=cv2.INTER_LINEAR) return cv2.resize(small, (image.shape[1], image.shape[0]), interpolation=cv2.INTER_LINEAR) def cutmix(img1, img2, bboxes1=None, bboxes2=None, beta=1.0): """ 参数: img1: 第一张输入图像(numpy数组) img2: 第二张输入图像(numpy数组) bboxes1: 第一张图像的边界框(可选) bboxes2: 第二张图像的边界框(可选) beta: Beta分布的参数，控制裁剪区域的大小返回: 混合后的图像和边界框(如果有) """ # 确保图像尺寸相同 if img1.shape != img2.shape: img2 = cv2.resize(img2, (img1.shape[1], img1.shape[0])) h, w = img1.shape[:2] # 生成裁剪区域的lambda值(混合比例) lam = np.random.beta(beta, beta) # 计算裁剪区域的宽高 cut_ratio = np.sqrt(1. - lam) cut_w = int(w * cut_ratio) cut_h = int(h * cut_ratio) # 随机确定裁剪区域的中心点 cx = np.random.randint(w) cy = np.random.randint(h) # 计算裁剪区域的边界 x1 = np.clip(cx - cut_w // 2, 0, w) y1 = np.clip(cy - cut_h // 2, 0, h) x2 = np.clip(cx + cut_w // 2, 0, w) y2 = np.clip(cy + cut_h // 2, 0, h) # 执行CutMix操作 mixed_img = img1.copy() mixed_img[y1:y2, x1:x2] = img2[y1:y2, x1:x2] # 计算实际的混合比例 lam = 1 - ((x2 - x1) * (y2 - y1) / (w * h)) # 处理边界框(如果有) mixed_bboxes = None if bboxes1 is not None and bboxes2 is not None: mixed_bboxes = [] # 添加第一张图像的边界框 for bbox in bboxes1: mixed_bboxes.append(bbox + [lam]) # 添加混合权重 # 添加第二张图像的边界框(只添加在裁剪区域内的) for bbox in bboxes2: # 检查边界框是否在裁剪区域内 bbox_x_center = (bbox[0] + bbox[2]) / 2 bbox_y_center = (bbox[1] + bbox[3]) / 2 if (x1 <= bbox_x_center <= x2) and (y1 <= bbox_y_center <= y2): mixed_bboxes.append(bbox + [1 - lam]) return mixed_img, mixed_bboxes def image_degradation_and_augmentation(image, bboxes=None): """ 完整的图像退化和增强流程(修改为使用CutMix) 参数: image: 输入图像(PIL.Image或numpy数组) bboxes: 边界框(可选) 返回: dict: 包含所有退化组和最终增强结果的字典 """ # 确保输入是numpy数组 if not isinstance(image, np.ndarray): image = np.array(image) # 确保图像为RGB格式 if len(image.shape) 
== 2: image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB) elif image.shape[2] == 4: image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB) degraded_sets = [] original = image.copy() # 第一组退化：三种基础退化 degraded_sets.append(wavelet_degradation(original.copy())) degraded_sets.append(degraded_sets) degraded_sets.append(adaptive_interpolation_degradation(original.copy())) degraded_sets.append(degraded_sets) degraded_sets.append(bilinear_degradation(original.copy())) degraded_sets.append(degraded_sets) # # 原始图像 # original = image.copy() # # 插值方法列表 # interpolation_methods = ['nearest', 'bilinear', 'bicubic'] # # 第一组退化: 三种插值方法 # group1 = [] # for method in interpolation_methods: # degraded = apply_interpolation_degradation(original, method) # group1.append(degraded) # 第二组退化: 随机额外退化 # group2 = [] # for img in group1: # # 随机选择一种退化方法 # method = random.choice(interpolation_methods) # extra_degraded = apply_interpolation_degradation(img, method) # group2.append(extra_degraded) # 第二组退化：随机选择再退化 methods = [wavelet_degradation, adaptive_interpolation_degradation, bilinear_degradation] group2=[] for img in degraded_sets: selected_method = random.choice(methods) group2.append(selected_method(img)) group2.append(group2) # 原始图像 original = image.copy() all_degraded_images = [original] + degraded_sets + group2 # 应用黑暗处理 dark_original = darken_image(original) dark_degraded = [darken_image(img) for img in all_degraded_images] # 合并原始和退化图像 all_images = [dark_original] + dark_degraded # 应用数据增强 # 1. 随机仿射变换 affine_images = [random_affine(img) for img in all_images] # 2. HSV增强 hsv_images = [augment_hsv(img) for img in affine_images] # 3. 
CutMix增强 # 随机选择两个增强后的图像进行混合 mixed_image, mixed_bboxes = cutmix( random.choice(hsv_images), random.choice(hsv_images), bboxes1=bboxes if bboxes is not None else None, bboxes2=bboxes if bboxes is not None else None ) # 返回结果 results = { 'original': original, 'degraded': dark_degraded, 'augmented_images': hsv_images, # 所有增强后的图像（原始+六组退化） 'cutmix_image': mixed_image, # CutMix混合图像 'cutmix_bboxes': mixed_bboxes if bboxes is not None else None # 混合后的边界框 } return results train_transform = transforms.Compose([ transforms.RandomResizedCrop(32), transforms.RandomHorizontalFlip(p=0.5), transforms.RandomApply([transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)], p=0.8), transforms.RandomGrayscale(p=0.2), transforms.ToTensor(), transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])]) test_transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])]) # data_processing prepare train_data = CIFAR10Pair(root="E:/contrast/yolov8/MoCo/data_visdrone2019", train=True, transform=train_transform, download=False) moco_train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True, num_workers=0, pin_memory=True, drop_last=True) memory_data = CIFAR10(root="E:/contrast/yolov8/MoCo/data_visdrone2019", train=True, transform=test_transform, download=False) memory_loader = DataLoader(memory_data, batch_size=args.batch_size, shuffle=False, num_workers=0, pin_memory=True) test_data = CIFAR10(root="E:/contrast/yolov8/MoCo/data_visdrone2019", train=False, transform=test_transform, download=False) test_loader = DataLoader(test_data, batch_size=args.batch_size, shuffle=False, num_workers=0, pin_memory=True) # 定义基本编码器 # SplitBatchNorm: simulate multi-gpu behavior of BatchNorm in one gpu by splitting alone the batch dimension # implementation adapted from https://github.com/davidcpage/cifar10-fast/blob/master/torch_backend.py class SplitBatchNorm(nn.BatchNorm2d): def init(self, 
num_features, num_splits, kw): super().init(num_features, kw) self.num_splits = num_splits def forward(self, input): N, C, H, W = input.shape if self.training or not self.track_running_stats: running_mean_split = self.running_mean.repeat(self.num_splits) running_var_split = self.running_var.repeat(self.num_splits) outcome = nn.functional.batch_norm( input.view(-1, C * self.num_splits, H, W), running_mean_split, running_var_split, self.weight.repeat(self.num_splits), self.bias.repeat(self.num_splits), True, self.momentum, self.eps).view(N, C, H, W) self.running_mean.data.copy_(running_mean_split.view(self.num_splits, C).mean(dim=0)) self.running_var.data.copy_(running_var_split.view(self.num_splits, C).mean(dim=0)) return outcome else: return nn.functional.batch_norm( input, self.running_mean, self.running_var, self.weight, self.bias, False, self.momentum, self.eps) class ModelBase(nn.Module): """ Common CIFAR ResNet recipe. Comparing with ImageNet ResNet recipe, it: (i) replaces conv1 with kernel=3, str=1 (ii) removes pool1 """ def init(self, feature_dim=128, arch=None, bn_splits=16): super(ModelBase, self).init() # use split batchnorm norm_layer = partial(SplitBatchNorm, num_splits=bn_splits) if bn_splits > 1 else nn.BatchNorm2d resnet_arch = getattr(resnet, arch) net = resnet_arch(num_classes=feature_dim, norm_layer=norm_layer) self.net = [] for name, module in net.named_children(): if name == 'conv1': module = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) if isinstance(module, nn.MaxPool2d): continue if isinstance(module, nn.Linear): self.net.append(nn.Flatten(1)) self.net.append(module) self.net = nn.Sequential(self.net) def forward(self, x): x = self.net(x) # note: not normalized here return x # 定义MOCO class ModelMoCo(nn.Module): def init(self, dim=128, K=4096, m=0.99, T=0.1, arch='resnet18', bn_splits=8, symmetric=True): super(ModelMoCo, self).init() self.K = K self.m = m self.T = T self.symmetric = symmetric # create the encoders 
self.encoder_q = ModelBase(feature_dim=dim, arch=arch, bn_splits=bn_splits) self.encoder_k = ModelBase(feature_dim=dim, arch=arch, bn_splits=bn_splits) for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()): param_k.data.copy_(param_q.data) # initialize param_k.requires_grad = False # not update by gradient 不参与训练 # create the queue self.register_buffer("queue", torch.randn(dim, K)) self.queue = nn.functional.normalize(self.queue, dim=0) self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long)) @torch.no_grad() def _momentum_update_key_encoder(self): # 动量更新encoder_k """ Momentum update of the key encoder """ for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()): param_k.data = param_k.data self.m + param_q.data * (1. - self.m) @torch.no_grad() def _dequeue_and_enqueue(self, keys): # 出队与入队 batch_size = keys.shape[0] ptr = int(self.queue_ptr) assert self.K % batch_size == 0 # for simplicity # replace the keys at ptr (dequeue and enqueue) self.queue[:, ptr:ptr + batch_size] = keys.t() # transpose ptr = (ptr + batch_size) % self.K # move pointer self.queue_ptr[0] = ptr @torch.no_grad() def _batch_shuffle_single_gpu(self, x): """ Batch shuffle, for making use of BatchNorm. """ # random shuffle index idx_shuffle = torch.randperm(x.shape[0]).cuda() # index for restoring idx_unshuffle = torch.argsort(idx_shuffle) return x[idx_shuffle], idx_unshuffle @torch.no_grad() def _batch_unshuffle_single_gpu(self, x, idx_unshuffle): """ Undo batch shuffle. 
""" return x[idx_unshuffle] def contrastive_loss(self, im_q, im_k): # compute query features q = self.encoder_q(im_q) # queries: NxC q = nn.functional.normalize(q, dim=1) # already normalized # compute key features with torch.no_grad(): # no gradient to keys # shuffle for making use of BN im_k_, idx_unshuffle = self._batch_shuffle_single_gpu(im_k) k = self.encoder_k(im_k_) # keys: NxC k = nn.functional.normalize(k, dim=1) # already normalized # undo shuffle k = self._batch_unshuffle_single_gpu(k, idx_unshuffle) # compute logits # Einstein sum is more intuitive # positive logits: Nx1 l_pos = torch.einsum('nc,nc->n', [q, k]).unsqueeze(-1) # negative logits: NxK l_neg = torch.einsum('nc,ck->nk', [q, self.queue.clone().detach()]) # logits: Nx(1+K) logits = torch.cat([l_pos, l_neg], dim=1) # apply temperature logits /= self.T # labels: positive key indicators labels = torch.zeros(logits.shape[0], dtype=torch.long).cuda() loss = nn.CrossEntropyLoss().cuda()(logits, labels) # 交叉熵损失 return loss, q, k def forward(self, im1, im2): """ Input: im_q: a batch of query images im_k: a batch of key images Output: loss """ # update the key encoder with torch.no_grad(): # no gradient to keys self._momentum_update_key_encoder() # compute loss if self.symmetric: # asymmetric loss loss_12, q1, k2 = self.contrastive_loss(im1, im2) loss_21, q2, k1 = self.contrastive_loss(im2, im1) loss = loss_12 + loss_21 k = torch.cat([k1, k2], dim=0) else: # asymmetric loss loss, q, k = self.contrastive_loss(im1, im2) self._dequeue_and_enqueue(k) return loss # create model moco_model = ModelMoCo( dim=args.moco_dim, K=args.moco_k, m=args.moco_m, T=args.moco_t, arch=args.arch, bn_splits=args.bn_splits, symmetric=args.symmetric, ).cuda() # print(moco_model.encoder_q) moco_model_1 = ModelMoCo( dim=args.moco_dim, K=args.moco_k, m=args.moco_m, T=args.moco_t, arch=args.arch, bn_splits=args.bn_splits, symmetric=args.symmetric, ).cuda() # print(moco_model_1.encoder_q) """ CIFAR10 Dataset. 
""" from torch.cuda import amp scaler = amp.GradScaler(enabled=cuda) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # train for one epoch # def moco_train(net, net_1, data_loader, train_optimizer, epoch, args): # net.train() # adjust_learning_rate(moco_optimizer, epoch, args) # total_loss, total_num, train_bar = 0.0, 0, tqdm(data_loader) # loss_add = 0.0 # for im_1, im_2 in train_bar: # im_1, im_2 = im_1.cuda(non_blocking=True), im_2.cuda(non_blocking=True) # loss = net(im_1, im_2) # 原始图像对比损失梯度清零—>梯度回传—>梯度跟新 # # lossT = loss # 只使用原始对比损失 # # train_optimizer.zero_grad() # # lossT.backward() # # train_optimizer.step() # # loss_add += lossT.item() # # total_num += data_loader.batch_size # # total_loss += loss.item() * data_loader.batch_size # # train_bar.set_description( # # 'Train Epoch: [{}/{}], lr: {:.6f}, Loss: {:.4f}'.format( # # epoch, args.epochs, # # train_optimizer.param_groups[0]['lr'], # # loss_add / total_num # # ) # # ) # #傅里叶变换处理流程 # #im_3 = torch.rfft(im_1, 3, onesided=False, normalized=True)[:, :, :, :, 0] # fft_output = torch.fft.fftn(im_1, dim=(-3, -2, -1), norm="ortho")#转换为频域 # real_imag = torch.view_as_real(fft_output)#分解实部虚部 # im_3 = real_imag[..., 0]#提取频域实部作为新视图 # #该处理实现了频域空间的增强，与空间域增强形成了互补 # #im_4 = torch.rfft(im_2, 3, onesided=False, normalized=True)[:, :, :, :, 0] # fft_output = torch.fft.fftn(im_2, dim=(-3, -2, -1), norm="ortho") # real_imag = torch.view_as_real(fft_output) # im_4 = real_imag[..., 0] # loss_1 = net_1(im_3, im_4)#频域特征对比损失 # lossT = 0.8loss + 0.2loss_1#多模态损失对比融合 # train_optimizer.zero_grad() # lossT.backward() # train_optimizer.step() # loss_add += lossT # total_num += data_loader.batch_size # total_loss += loss.item() * data_loader.batch_size # # train_bar.set_description( # # 'Train Epoch: [{}/{}], lr: {:.6f}, Loss: {:.4f}'.format(epoch, args.epochs, moco_optimizer.param_groups[0]['lr'], # # loss_add / total_num)) # return (loss_add / total_num).cpu().item() # yolov5需要的损失 def moco_train(net, net_1, 
data_loader, train_optimizer, epoch, args): net.train() adjust_learning_rate(train_optimizer, epoch, args) total_loss, total_num = 0.0, 0 train_bar = tqdm(data_loader) for im_1, im_2, im_3, im_4 in train_bar: # 接收4组视图 im_1, im_2 = im_1.cuda(), im_2.cuda() im_3, im_4 = im_3.cuda(), im_4.cuda() # 原始空间域对比损失 loss_orig = net(im_1, im_2) # 退化增强图像的空间域对比损失 loss_degraded = net(im_3, im_4) # 频域处理（对退化增强后的图像） fft_3 = torch.fft.fftn(im_3, dim=(-3, -2, -1), norm="ortho") fft_3 = torch.view_as_real(fft_3)[..., 0] # 取实部 fft_4 = torch.fft.fftn(im_4, dim=(-3, -2, -1), norm="ortho") fft_4 = torch.view_as_real(fft_4)[..., 0] # 频域对比损失 loss_freq = net_1(fft_3, fft_4) # 多模态损失融合 loss = 0.6 * loss_orig + 0.3 * loss_degraded + 0.1 * loss_freq # 反向传播 train_optimizer.zero_grad() loss.backward() train_optimizer.step() # 记录损失 total_num += data_loader.batch_size total_loss += loss.item() # train_bar.set_description(f'Epoch: [{epoch}/{args.epochs}] Loss: {total_loss/total_num:.4f}') return total_loss / total_num # lr scheduler for training def adjust_learning_rate(optimizer, epoch, args): # 学习率衰减 """Decay the learning rate based on schedule""" lr = args.lr if args.cos: # cosine lr schedule lr = 0.5 (1. + math.cos(math.pi * epoch / args.epochs)) else: # stepwise lr schedule for milestone in args.schedule: lr = 0.1 if epoch >= milestone else 1. 
for param_group in optimizer.param_groups: param_group['lr'] = lr # test using a knn monitor def test(net, memory_data_loader, test_data_loader, epoch, args): net.eval() classes = len(memory_data_loader.dataset.classes) total_top1, total_top5, total_num, feature_bank = 0.0, 0.0, 0, [] with torch.no_grad(): # generate feature bank for data, target in tqdm(memory_data_loader, desc='Feature extracting'): feature = net(data.cuda(non_blocking=True)) feature = F.normalize(feature, dim=1) feature_bank.append(feature) # [D, N] feature_bank = torch.cat(feature_bank, dim=0).t().contiguous() # [N] feature_labels = torch.tensor(memory_data_loader.dataset.targets, device=feature_bank.device) # loop test data_processing to predict the label by weighted knn search test_bar = tqdm(test_data_loader) for data, target in test_bar: data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True) feature = net(data) feature = F.normalize(feature, dim=1) pred_labels = knn_predict(feature, feature_bank, feature_labels, classes, args.knn_k, args.knn_t) total_num += data.size(0) total_top1 += (pred_labels[:, 0] == target).float().sum().item() test_bar.set_description( 'Test Epoch: [{}/{}] Acc@1:{:.2f}%'.format(epoch, args.epochs, total_top1 / total_num 100)) return total_top1 / total_num * 100 # knn monitor as in InstDisc https://arxiv.org/abs/1805.01978 # implementation follows http://github.com/zhirongw/lemniscate.pytorch and https://github.com/leftthomas/SimCLR def knn_predict(feature, feature_bank, feature_labels, classes, knn_k, knn_t): # compute cos similarity between each feature vector and feature bank ---> [B, N] sim_matrix = torch.mm(feature, feature_bank) # [B, K] sim_weight, sim_indices = sim_matrix.topk(k=knn_k, dim=-1) # [B, K] sim_labels = torch.gather(feature_labels.expand(feature.size(0), -1), dim=-1, index=sim_indices) sim_weight = (sim_weight / knn_t).exp() # counts 
for each class one_hot_label = torch.zeros(feature.size(0) * knn_k, classes, device=sim_labels.device) # [BK, C] one_hot_label = one_hot_label.scatter(dim=-1, index=sim_labels.view(-1, 1), value=1.0) # weighted score ---> [B, C] pred_scores = torch.sum(one_hot_label.view(feature.size(0), -1, classes) sim_weight.unsqueeze(dim=-1), dim=1) pred_labels = pred_scores.argsort(dim=-1, descending=True) return pred_labels # 开始训练 # define optimizer moco_optimizer = torch.optim.SGD(moco_model.parameters(), lr=args.lr, weight_decay=args.wd, momentum=0.9) 上述问题怎么修改？

class ResNetSmall(nn.Module): def __init__(self, base_model='resnet18'): super().__init__() self.encoder = torchvision.models.__dict__[base_model]() self.encoder.conv1 = nn.Conv2d(3, 64, kernel_...

import h5py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import matplotlib.pyplot as plt


# ------------------------- 1. Data loading (h5py) -------------------------
def load_hdf5_data(file_path):
    """Load the ``X`` and ``y`` datasets of an HDF5 file as float32 tensors.

    Args:
        file_path: Path of the HDF5 file; it must contain the keys ``'X'``
            and ``'y'``.

    Returns:
        A ``(X_tensor, y_tensor)`` tuple. ``X_tensor`` is the stored ``X``
        array with its axes reordered by ``permute(2, 0, 1)``; ``y_tensor``
        is the stored ``y`` array transposed.
    """
    with h5py.File(file_path, 'r') as f:
        # Print the top-level keys so a wrong dataset name is easy to spot.
        print("HDF5文件结构:", list(f.keys()))
        # np.array() copies the data, so it stays valid after the file closes.
        X = np.array(f['X'])
        y = np.array(f['y'])

    # NOTE(review): the original comments described X as (num_samples, 64, 2)
    # and the permuted tensor as (batch, 2, 64). permute(2, 0, 1) only yields
    # (batch, 2, 64) when the array is stored as (2, 64, num_samples) --
    # presumably the reversed axis order of a MATLAB v7.3 export; confirm
    # against the actual file.
    X_tensor = torch.from_numpy(X.astype(np.float32)).permute(2, 0, 1)
    # NOTE(review): likewise assumes y is stored as (128, num_samples).
    y_tensor = torch.from_numpy(y.astype(np.float32)).permute(1, 0)
    return X_tensor, y_tensor


# ------------------------- 2. Model definition -------------------------
class JointEstimationModel(nn.Module):
    """Conv1d -> LSTM -> two fully connected layers with a sigmoid output.

    Args:
        input_dim: Length of the input sequence (last axis of the input).
            The Conv1d layer (kernel 3, padding 1) preserves this length, so
            the flattened LSTM output has ``input_dim * 128`` features.
        output_dim: Number of output units.
    """

    # The constructor must be named __init__ (and must call
    # super().__init__()); a method called plain "init" is never invoked by
    # nn.Module, which would leave the model without any layers.
    def __init__(self, input_dim=64, output_dim=128):
        super().__init__()
        self.conv1d = nn.Conv1d(in_channels=2, out_channels=64, kernel_size=3, padding=1)
        self.lstm = nn.LSTM(input_size=64, hidden_size=128, batch_first=True)
        # input_dim * 128 equals the previous hard-coded 64 * 128 for the
        # default input_dim=64, but now follows the constructor argument.
        self.fc1 = nn.Linear(input_dim * 128, 256)
        self.fc2 = nn.Linear(256, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map an input of shape (batch, 2, input_dim) to (batch, output_dim)."""
        x = torch.relu(self.conv1d(x))  # (batch, 64 channels, input_dim)
        x = x.permute(0, 2, 1)          # (batch, input_dim, 64): sequence-first for the LSTM
        x, _ = self.lstm(x)             # (batch, input_dim, 128)
        x = x.reshape(x.size(0), -1)    # flatten to (batch, input_dim * 128)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return self.sigmoid(x)


# ------------------------- 3. Training function -------------------------
def train_model(train_loader, model, criterion, optimizer, device):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        model: Module to optimise; it is switched to training mode.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates the parameters of ``model``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The mean per-batch loss as a float, or ``0.0`` when the loader
        yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Accumulate the scalar loss; the original initialised total_loss
        # but never updated or returned it.
        total_loss += loss.item()
        num_batches += 1
    return total_loss / num_batches if num_batches else 0.0

self.lstm = nn.LSTM(input_size, hidden_size, num_layers, dropout=0.2) --- ### 二、训练过程优化 1. **梯度管理** - 实施梯度裁剪防止爆炸： python torch.nn.utils.clip_grad_norm_(model....

【性能优化】：PyTorch CNN训练速度提升的五大策略

[【性能优化】：PyTorch CNN训练速度提升的五大策略](https://opengraph.githubassets.com/890bb0e38562548c3a0cb18b11a079223a9c4bdcec3ae601d0e60b0d122eadaa/SforAiDl/KD_Lib) # 1. PyTorch CNN训练速度问题概述 ...

深度学习与数据增强：医学图像分割中的关键技术和工具

# 摘要

随着深度学习技术的发展，医学图像分割在医疗诊断中的应用越来越广泛。本文旨在提供深度学习在医学图像分割领域中应用的全面综述。首先，介绍了深度学习和医学图像分割的基础知识。随后，深入探讨了数据...

【性能优化实战】：如何显著提升RegSeg网络的速度与精度

# 1. RegSeg网络概述

RegSeg网络是一种深度学习模型，主要用于处理图像分割问题，它将图像中的每个像素分类，以识别不同的对象或区域。该网络在自动驾驶、医...

交通手势识别系统的优化之路：掌握模型优化的黄金法则

# 摘要

交通手势识别系统对于提升道路安全和管理有着重要意义。本文全面概述了交通手势识别系统，包括深度学习在该领域中的应用和模型优化的理论基础。通过深入探讨模型优化的目标与方法，如减少过拟合、数据增强...

【YOLOv1基础】训练技巧：针对YOLOv1的训练过程中的要点

# 1. YOLOv1概述与理论基础

YOLOv1（You Only Look Once version 1）是一种在2016年首次提出的实时目标检测系统。它将目标检测任务视为一个回归问题，直接在图像中预测边界框和类别概率。YOLOv1通过将输入图像划分...

C#类库封装：简化SDK调用实现多功能集成，构建地磅无人值守系统

内容概要：本文介绍了利用C#类库封装多个硬件设备的SDK接口，实现一系列复杂功能的一键式调用。具体功能包括身份证信息读取、人证识别、车牌识别（支持臻识和海康摄像头）、LED显示屏文字输出、称重数据读取、二维码扫描以及语音播报。所有功能均被封装为简单的API，极大降低了开发者的工作量和技术门槛。文中详细展示了各个功能的具体实现方式及其应用场景，如身份证读取、人证核验、车牌识别等，并最终将这些功能整合到一起，形成了一套完整的地磅称重无人值守系统解决方案。适合人群：具有一定C#编程经验的技术人员，尤其是需要快速集成多种硬件设备SDK的应用开发者。使用场景及目标：适用于需要高效集成多种硬件设备SDK的项目，特别是那些涉及身份验证、车辆管理、物流仓储等领域的企业级应用。通过使用这些封装好的API，可以大大缩短开发周期，降低维护成本，提高系统的稳定性和易用性。其他说明：虽然封装后的API极大地简化了开发流程，但对于一些特殊的业务需求，仍然可能需要深入研究底层SDK。此外，在实际部署过程中，还需考虑网络环境、硬件兼容性等因素的影响。

基于STM32F1的BLDC无刷直流电机与PMSM永磁同步电机源码解析：传感器与无传感器驱动详解

基于STM32F1的BLDC无刷直流电机和PMSM永磁同步电机的驱动实现方法，涵盖了有传感器和无传感两种驱动方式。对于BLDC电机，有传感器部分采用霍尔传感器进行六步换相，无传感部分则利用反电动势过零点检测实现换相。对于PMSM电机，有传感器部分包括霍尔传感器和编码器的方式，无传感部分则采用了滑模观测器进行矢量控制（FOC）。文中不仅提供了详细的代码片段，还分享了许多调试经验和技巧。适合人群：具有一定嵌入式系统和电机控制基础知识的研发人员和技术爱好者。使用场景及目标：适用于需要深入了解和实现BLDC和PMSM电机驱动的开发者，帮助他们掌握不同传感器条件下的电机控制技术和优化方法。其他说明：文章强调了实际调试过程中可能遇到的问题及其解决方案，如霍尔传感器的中断触发换相、反电动势过零点检测的采样时机、滑模观测器的参数调整以及编码器的ABZ解码等。

基于Java的跨平台图像处理软件ImageJ：多功能图像编辑与分析工具

内容概要：本文介绍了基于Java的图像处理软件ImageJ，详细阐述了它的跨平台特性、多线程处理能力及其丰富的图像处理功能。ImageJ由美国国立卫生研究院开发，能够在多种操作系统上运行，包括Windows、Mac OS、Linux等。它支持多种图像格式，如TIFF、PNG、GIF、JPEG、BMP、DICOM、FITS等，并提供图像栈功能，允许多个图像在同一窗口中进行并行处理。此外，ImageJ还提供了诸如缩放、旋转、扭曲、平滑处理等基本操作，以及区域和像素统计、间距、角度计算等高级功能。这些特性使ImageJ成为科研、医学、生物等多个领域的理想选择。适合人群：需要进行图像处理的专业人士，如科研人员、医生、生物学家，以及对图像处理感兴趣的普通用户。使用场景及目标：适用于需要高效处理大量图像数据的场合，特别是在科研、医学、生物学等领域。用户可以通过ImageJ进行图像的编辑、分析、处理和保存，提高工作效率。其他说明：ImageJ不仅功能强大，而且操作简单，用户无需安装额外的运行环境即可直接使用。其基于Java的开发方式确保了不同操作系统之间的兼容性和一致性。

MATLAB语音识别系统：基于GUI的数字0-9识别及深度学习模型应用 · GUI v1.2

内容概要：本文介绍了一款基于MATLAB的语音识别系统，主要功能是识别数字0到9。该系统采用图形用户界面（GUI），方便用户操作，并配有详尽的代码注释和开发报告。文中详细描述了系统的各个组成部分，包括音频采集、信号处理、特征提取、模型训练和预测等关键环节。此外，还讨论了MATLAB在此项目中的优势及其面临的挑战，如提高识别率和处理背景噪音等问题。最后，通过对各模块的工作原理和技术细节的总结，为未来的研究和发展提供了宝贵的参考资料。适合人群：对语音识别技术和MATLAB感兴趣的初学者、学生或研究人员。使用场景及目标：适用于希望深入了解语音识别技术原理的人群，特别是希望通过实际案例掌握MATLAB编程技巧的学习者。目标是在实践中学习如何构建简单的语音识别应用程序。其他说明：该程序需要MATLAB 2019b及以上版本才能正常运行，建议使用者确保软件环境符合要求。

c语言通讯录管理系统源码.zip

C语言项目源码

基于MATLAB的电力与天然气市场出清及多方博弈行为分析

内容概要：本文探讨了利用MATLAB进行电力市场和天然气市场出清的研究，重点分析了多方博弈行为对能源价格和市场出清的影响。文中首先介绍了简单的电力市场出清问题，通过最小化成本来满足负荷需求，并展示了具体的MATLAB代码实现。接着讨论了发电厂之间的博弈行为，特别是如何通过纳什均衡求解来最大化各自的利润。对于天然气市场，文章提到了管网压力约束带来的复杂性以及如何通过半定松弛技巧将非凸优化问题转化为可处理的形式。最后，文章还涉及了电-气市场耦合情况下的价格传导效应，展示了如何使用符号计算工具箱进行交叉价格弹性的计算。适合人群：对电力市场、天然气市场及能源经济学感兴趣的科研人员和技术开发者。使用场景及目标：适用于希望深入了解电力和天然气市场运作机制的人群，尤其是那些希望通过数学建模和仿真手段研究市场动态变化的专业人士。目标是帮助读者掌握如何使用MATLAB进行复杂的市场出清模拟和博弈分析。其他说明：文章不仅提供了理论背景介绍，还包括了大量的MATLAB代码实例，便于读者理解和实践。此外，文章强调了现实中市场出清问题的复杂性和多变性，鼓励读者结合实际情况灵活应用所学知识。

c语言学生信息系统.zip

C语言项目源码

相关推荐

TensorFlow tf.nn.max_pool实现池化操作方式

fft.java.rar_RScode _fft.java_java f_java fft jar_java fft strid

Tensorflow tf.nn.atrous_conv2d如何实现空洞卷积的

【性能优化】：PyTorch CNN训练速度提升的五大策略

深度学习与数据增强：医学图像分割中的关键技术和工具

【性能优化实战】：如何显著提升RegSeg网络的速度与精度

交通手势识别系统的优化之路：掌握模型优化的黄金法则

【YOLOv1基础】训练技巧：针对YOLOv1的训练过程中的要点

C#类库封装：简化SDK调用实现多功能集成，构建地磅无人值守系统

基于STM32F1的BLDC无刷直流电机与PMSM永磁同步电机源码解析：传感器与无传感器驱动详解

基于Java的跨平台图像处理软件ImageJ：多功能图像编辑与分析工具

MATLAB语音识别系统：基于GUI的数字0-9识别及深度学习模型应用 · GUI v1.2

c语言通讯录管理系统源码.zip

基于MATLAB的电力与天然气市场出清及多方博弈行为分析

c语言学生信息系统.zip

大家在看

ELEC5208 Group project submissions.zip_furniturer4m_smart grid_悉

基于python单通道脑电信号的自动睡眠分期研究

bid格式文件电子标书阅读器.zip

机器翻译WMT14数据集

高通QXDM使用手册.pdf

最新推荐

Tensorflow tf.nn.atrous_conv2d如何实现空洞卷积的

C#类库封装：简化SDK调用实现多功能集成，构建地磅无人值守系统

Teleport Pro教程：轻松复制网站内容

【跨平台开发者的必读】：解决Qt5Widgetsd.lib目标计算机类型冲突终极指南

普通RNN结构和特点

探讨通用数据连接池的核心机制与应用

【LabVIEW网络通讯终极指南】：7个技巧提升UDP性能和安全性

简要介绍cnn卷积神经网络

基于ASP的深度学习网站导航系统功能详解

【Oracle数据泵进阶技巧】：避免ORA-31634和ORA-31664错误的终极策略