tf.image.resize_bilinear

### Answer 1:
tf.image.resize_bilinear is a TensorFlow function that resizes images using bilinear interpolation. It rescales the input image to the requested output size (the aspect ratio is only preserved if the requested size has the same ratio as the input). The input is a 4D tensor of shape [batch, height, width, channels], where batch is the batch size, height and width are the original image dimensions, and channels is the number of channels. The output is also a 4D tensor with the same batch and channel dimensions, but with height and width set to the requested size; the result is returned as float32.

### Answer 2:
tf.image.resize_bilinear implements image resizing with bilinear interpolation.

Bilinear interpolation is a common image-scaling algorithm. It produces new pixel values by interpolating between pixels of the original image: each output pixel is computed from the positions and intensities of nearby input pixels.

The output size is passed explicitly as the target (height, width); if you want to scale by a factor, compute the target size from the input shape first. When resizing, the function interpolates over the original image to generate the new pixel values.

The inputs to tf.image.resize_bilinear are the image tensor and the target size; it returns a new tensor containing the resized image.

tf.image.resize_bilinear makes it easy to resize images, for example in data preprocessing for deep-learning models or in general image-processing pipelines. By adjusting the target size you can flexibly control the output resolution for different tasks.

In short, tf.image.resize_bilinear is TensorFlow's bilinear-interpolation resize. It is simple to use and flexible, and is a common tool in image processing and deep learning.

### Answer 3:
tf.image.resize_bilinear is a TensorFlow function for resizing images with bilinear interpolation.

Bilinear interpolation estimates new pixel values by assuming a linear relationship between neighbouring pixel intensities. When scaling an image, the value of each target pixel is determined from the four nearest input pixels, weighted by their distance to the target location.

Concretely, given the original size and the target size, the function maps each output pixel back to a position in the input image, then computes its value as a distance-weighted combination of the four surrounding input pixels.

Calling tf.image.resize_bilinear therefore lets you rescale an image to a specified size and obtain versions at different resolutions. It is widely used for preprocessing and data augmentation in image processing and computer-vision tasks.

Note that tf.image.resize_bilinear only resizes; it does not rotate or crop. For rotation or cropping, use the corresponding functions instead.
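For reference, a minimal usage sketch follows. In TensorFlow 2.x the op is exposed as tf.compat.v1.image.resize_bilinear, and tf.image.resize with the bilinear method is the recommended replacement; the shapes and values below are illustrative only:

```python
import tensorflow as tf

# A batch of 2 RGB images, as a 4D tensor [batch, height, width, channels].
images = tf.random.uniform([2, 128, 96, 3])

# TF1-style op (available in TF2 under tf.compat.v1); always returns float32.
out_v1 = tf.compat.v1.image.resize_bilinear(images, size=[64, 48])

# Recommended TF2 equivalent.
out_v2 = tf.image.resize(images, size=[64, 48],
                         method=tf.image.ResizeMethod.BILINEAR)

print(out_v1.shape)  # (2, 64, 48, 3)
print(out_v2.shape)  # (2, 64, 48, 3)
```

Both calls resize to exactly the requested (height, width); if you need to keep the aspect ratio, either pass preserve_aspect_ratio=True to tf.image.resize or compute the target size yourself.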
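To make the four-neighbour weighting from Answer 3 concrete, here is a small NumPy sketch of bilinear resizing for a single-channel image. It uses the simple `output_index * scale` coordinate mapping; real implementations differ in how they place pixel centres (e.g. align_corners or half-pixel conventions), so treat this as an illustration of the math rather than a bit-exact reimplementation:

```python
import numpy as np

def bilinear_resize(img, out_h, out_w):
    """Resize a 2D float image to (out_h, out_w) with bilinear interpolation."""
    in_h, in_w = img.shape
    out = np.empty((out_h, out_w), dtype=np.float32)
    scale_y = in_h / out_h
    scale_x = in_w / out_w
    for oy in range(out_h):
        for ox in range(out_w):
            # Map the output pixel back into input coordinates.
            y = oy * scale_y
            x = ox * scale_x
            y0, x0 = int(np.floor(y)), int(np.floor(x))
            y1, x1 = min(y0 + 1, in_h - 1), min(x0 + 1, in_w - 1)
            wy, wx = y - y0, x - x0
            # Blend the four neighbours, weighted by distance to (y, x).
            top = (1 - wx) * img[y0, x0] + wx * img[y0, x1]
            bottom = (1 - wx) * img[y1, x0] + wx * img[y1, x1]
            out[oy, ox] = (1 - wy) * top + wy * bottom
    return out

# Example: upscale a 2x2 gradient to 4x4.
img = np.array([[0.0, 1.0], [2.0, 3.0]], dtype=np.float32)
print(bilinear_resize(img, 4, 4))
```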

相关推荐

# -*- coding: utf-8 -*- """ Created on Thu Apr 25 16:05:29 2024 @author: lich5 """ import numpy as np # linear algebra import tensorflow as tf # from tensorflow import keras import matplotlib.pyplot as plt from tensorflow.keras import layers, models, Model, Sequential, datasets from tensorflow.keras.layers import MaxPool2D # Input data files are available in the read-only "../input/" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory # import os # for dirname, _, filenames in os.walk('/kaggle/input'): # for filename in filenames: # print(os.path.join(dirname, filename)) class Inception(tf.keras.Model): # c1--c4是每条路径的输出通道数 def __init__(self, ch1x1, ch3x3, ch5x5, pool_proj): super().__init__() # 线路1,单1x1卷积层 self.p1_1 = layers.Conv2D(ch1x1, 1, activation='relu') # 线路2,1x1卷积层后接3x3卷积层 self.p2_1 = layers.Conv2D(ch3x3[0], 1, activation='relu') self.p2_2 = layers.Conv2D(ch3x3[1], 3, padding='same', activation='relu') # 线路3,1x1卷积层后接5x5卷积层 self.p3_1 = layers.Conv2D(ch5x5[0], 1, activation='relu') self.p3_2 = layers.Conv2D(ch5x5[1], 5, padding='same', activation='relu') # 线路4,3x3最大汇聚层后接1x1卷积层 self.p4_1 = layers.MaxPool2D(3, 1, padding='same') self.p4_2 = layers.Conv2D(pool_proj, 1, activation='relu') def call(self, x): p1 = self.p1_1(x) p2 = self.p2_2(self.p2_1(x)) p3 = self.p3_2(self.p3_1(x)) p4 = self.p4_2(self.p4_1(x)) # 在通道维度上连结输出 return layers.Concatenate()([p1, p2, p3, p4]) class InceptionAux(tf.keras.Model): def __init__(self, num_classes): super().__init__() self.averagePool = layers.AvgPool2D(pool_size=5, strides=3) self.conv = layers.Conv2D(128, kernel_size=1, activation="relu") self.fc1 = layers.Dense(1024, activation="relu") self.fc2 = layers.Dense(num_classes) self.softmax = layers.Softmax() def call(self, x): # aux1: N x 512 x 14 x 14, aux2: N x 528 x 14 x 14 x = self.averagePool(x) # aux1: N x 512 x 4 x 4, aux2: N x 528 x 4 x 4 x = self.conv(x) # N x 128 x 4 x 4 x = layers.Flatten()(x) x = layers.Dropout(rate=0.5)(x) # N x 2048 x = self.fc1(x) x = layers.Dropout(rate=0.5)(x) # N x 1024 x = self.fc2(x) # N x num_classes x = self.softmax(x) return x # class GoogLeNet(im_height=224, im_width=224, class_num=1000, aux_logits=False): # # tensorflow中的tensor通道排序是NHWC # input_image = layers.Input(shape=(im_height, im_width, 3), dtype="float32") # # def b1: # # (None, 224, 224, 3) # x = layers.Conv2D(64, kernel_size=7, strides=2, padding="SAME", activation="relu")(input_image) # # (None, 112, 112, 64) # x = layers.MaxPool2D(pool_size=3, strides=2, padding="SAME")(x) # # def b2: # # (None, 56, 56, 64) # x = layers.Conv2D(64, kernel_size=1, activation="relu")(x) # # (None, 56, 56, 64) # x = layers.Conv2D(192, kernel_size=3, padding="SAME", activation="relu")(x) # # (None, 56, 56, 192) # x = layers.MaxPool2D(pool_size=3, strides=2, padding="SAME")(x) # # def b3: # # (None, 28, 28, 192) # x = Inception(64, (96, 128), (16, 32), 32)(x) # # (None, 28, 28, 256) # x = Inception(128, (128, 192), (32, 96), 64)(x) # # (None, 28, 28, 480) # x = layers.MaxPool2D(pool_size=3, strides=2, padding="SAME")(x) # # (None, 14, 14, 480) # # def b4: # x = Inception(192, (96, 208), (16, 48), 64)(x) # if aux_logits: # aux1 = InceptionAux(class_num)(x) # # (None, 14, 14, 512) # x = Inception(160, (112, 224), (24, 64), 64)(x) # # (None, 14, 14, 512) # x = Inception(128, (128, 256), (24, 64), 64)(x) # # (None, 14, 14, 512) # x = Inception(112, (144, 288), (32, 64), 64)(x) # if aux_logits: # aux2 = InceptionAux(class_num)(x) # # # def b5: # # (None, 
14, 14, 528) # x = Inception(256, (160, 320), (32, 128), 128)(x) # # (None, 14, 14, 532) # x = Inception(384, (192, 384), (48, 128), 128)(x) # # (None, 7, 7, 1024) # x = layers.GlobalAvgPool2D()(x) # # (None, 1, 1, 1024) # x = layers.Flatten()(x) # x = layers.Dense(class_num)(x) # # (None, class_num) # aux3 = layers.Softmax(x) # if aux_logits: # model = models.Model(inputs=input_image, outputs=[aux1, aux2, aux3]) # else: # model = models.Model(inputs=input_image, outputs=aux3) # return model if __name__ == '__main__': #%% load and preprocess data (train_images, train_labels), (test_images, test_labels) = datasets.cifar10.load_data() train_ds=tf.data.Dataset.from_tensor_slices((train_images,train_labels)) test_ds=tf.data.Dataset.from_tensor_slices((test_images,test_labels)) CLASS_NAMES= ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'] # plt.figure(figsize=(30,30)) # for i,(image,label) in enumerate(train_ds.shuffle(100000).take(20)): # #print(label) # ax=plt.subplot(5,5,i+1) # plt.imshow(image) # plt.title(CLASS_NAMES[label.numpy()[0]]) # plt.axis('off') def process_image(image,label): if len(image.shape) == 2: # 检查是否为二维图像 image = tf.expand_dims(image, axis=-1) # 添加通道维度 image=tf.image.per_image_standardization(image) image=tf.image.resize(image, (32,32), method=tf.image.ResizeMethod.BILINEAR) return image,label train_ds_size=tf.data.experimental.cardinality(train_ds).numpy() test_ds_size=tf.data.experimental.cardinality(test_ds).numpy() train_ds=(train_ds .map(process_image) .shuffle(buffer_size=train_ds_size) .batch(batch_size=128,drop_remainder=True) ) test_ds=(test_ds .map(process_image) .shuffle(buffer_size=test_ds_size) .batch(batch_size=128,drop_remainder=True) ) #%% define the model im_height = 96 im_width = 96 batch_size = 128 epochs = 15 # model = GoogLeNet(im_height=im_height, im_width=im_width, class_num=10, aux_logits=True) model = tf.keras.Sequential() # def b1: model.add(layers.Conv2D(64, 7, strides=2, padding='same', activation='relu')) model.add(layers.MaxPool2D(pool_size=3, strides=2, padding='same')) # def b2: model.add(layers.Conv2D(64, 1, activation='relu')) model.add(layers.Conv2D(192, 3, padding='same', activation='relu')) model.add(layers.MaxPool2D(pool_size=3, strides=2, padding='same')) # def b3: model.add(Inception(64, (96, 128), (16, 32), 32)) model.add(Inception(128, (128, 192), (32, 96), 64)) model.add(layers.MaxPool2D(pool_size=3, strides=2, padding='same')) # def b4: model.add(Inception(192, (96, 208), (16, 48), 64)) model.add(Inception(160, (112, 224), (24, 64), 64)) model.add(Inception(128, (128, 256), (24, 64), 64)) model.add(Inception(112, (144, 288), (32, 64), 64)) model.add(Inception(256, (160, 320), (32, 128), 128)) model.add(layers.MaxPool2D(pool_size=3, strides=2, padding='same')) # def b5: model.add(Inception(256, (160, 320), (32, 128), 128)) model.add(Inception(384, (192, 384), (48, 128), 128)) model.add(layers.GlobalAvgPool2D()) model.add(layers.Flatten()) # def FC model.add(layers.Dense(10)) model.compile( loss='sparse_categorical_crossentropy', optimizer=tf.optimizers.Adam(learning_rate=0.0005), metrics=['accuracy'] ) # model.build((batch_size, 224, 224, 3)) # when using subclass model # model.summary() history=model.fit( train_ds, epochs=epochs, #50 validation_data=test_ds ) # # 保存模型 # model.save('cnn_model.h5') # # 加载模型 # model = tf.keras.models.load_model('cnn_model.h5') model.evaluate(test_ds, verbose=2) idx = np.random.randint(1e4,size=9) images = test_images[idx,:] y_ = test_labels[idx] # 测试模型 def 
plot_cifar10_3_3(images, y_, y=None): assert images.shape[0] == len(y_) fig, axes = plt.subplots(3, 3) for i, ax in enumerate(axes.flat): ax.imshow(images[i], cmap='binary') if y is None: xlabel = 'True: {}'.format(CLASS_NAMES[y_[i][0]]) else: xlabel = 'True: {0}, Pred: {1}'.format(CLASS_NAMES[y_[i][0]], CLASS_NAMES[y[i]]) ax.set_xlabel(xlabel) ax.set_xticks([]) ax.set_yticks([]) plt.show() '''利用predict命令,输入x_test生成测试样本的测试值''' predictions = model.predict(images) y_pred = np.argmax(predictions, axis = 1) plot_cifar10_3_3(images, y_, y_pred) f,ax=plt.subplots(2,1,figsize=(10,10)) #Assigning the first subplot to graph training loss and validation loss ax[0].plot(history.history['loss'],color='b',label='Training Loss') ax[0].plot(history.history['val_loss'],color='r',label='Validation Loss') #Plotting the training accuracy and validation accuracy ax[1].plot(history.history['accuracy'],color='b',label='Training Accuracy') ax[1].plot(history.history['val_accuracy'],color='r',label='Validation Accuracy') plt.legend() # [EOF]修改一下

from libs.PipeLine import PipeLine, ScopedTiming from libs.AIBase import AIBase from libs.AI2D import Ai2d import os import ujson from media.media import * from time import * import nncase_runtime as nn import ulab.numpy as np import time import image import aidemo import random import gc import sys # 自定义人脸检测任务类 class FaceDetApp(AIBase): def __init__(self,kmodel_path,model_input_size,anchors,confidence_threshold=0.25,nms_threshold=0.3,rgb888p_size=[1280,720],display_size=[1920,1080],debug_mode=0): super().__init__(kmodel_path,model_input_size,rgb888p_size,debug_mode) # kmodel路径 self.kmodel_path=kmodel_path # 检测模型输入分辨率 self.model_input_size=model_input_size # 置信度阈值 self.confidence_threshold=confidence_threshold # nms阈值 self.nms_threshold=nms_threshold self.anchors=anchors # sensor给到AI的图像分辨率,宽16字节对齐 self.rgb888p_size=[ALIGN_UP(rgb888p_size[0],16),rgb888p_size[1]] # 视频输出VO分辨率,宽16字节对齐 self.display_size=[ALIGN_UP(display_size[0],16),display_size[1]] # debug模式 self.debug_mode=debug_mode # 实例化Ai2d,用于实现模型预处理 self.ai2d=Ai2d(debug_mode) # 设置Ai2d的输入输出格式和类型 self.ai2d.set_ai2d_dtype(nn.ai2d_format.NCHW_FMT,nn.ai2d_format.NCHW_FMT,np.uint8, np.uint8) # 配置预处理操作,这里使用了pad和resize,Ai2d支持crop/shift/pad/resize/affine,具体代码请打开/sdcard/app/libs/AI2D.py查看 def config_preprocess(self,input_image_size=None): with ScopedTiming("set preprocess config",self.debug_mode > 0): # 初始化ai2d预处理配置,默认为sensor给到AI的尺寸,可以通过设置input_image_size自行修改输入尺寸 ai2d_input_size=input_image_size if input_image_size else self.rgb888p_size # 计算padding参数,并设置padding预处理 self.ai2d.pad(self.get_pad_param(), 0, [104,117,123]) # 设置resize预处理 self.ai2d.resize(nn.interp_method.tf_bilinear, nn.interp_mode.half_pixel) # 构建预处理流程,参数为预处理输入tensor的shape和预处理输出的tensor的shape self.ai2d.build([1,3,ai2d_input_size[1],ai2d_input_size[0]],[1,3,self.model_input_size[1],self.model_input_size[0]]) # 自定义后处理,results是模型输出的array列表,这里调用了aidemo库的face_det_post_process接口 def postprocess(self,results): with ScopedTiming("postprocess",self.debug_mode > 0): res = aidemo.face_det_post_process(self.confidence_threshold,self.nms_threshold,self.model_input_size[0],self.anchors,self.rgb888p_size,results) if len(res)==0: return res else: return res[0] # 计算padding参数 def get_pad_param(self): dst_w = self.model_input_size[0] dst_h = self.model_input_size[1] # 计算最小的缩放比例,等比例缩放 ratio_w = dst_w / self.rgb888p_size[0] ratio_h = dst_h / self.rgb888p_size[1] if ratio_w < ratio_h: ratio = ratio_w else: ratio = ratio_h new_w = (int)(ratio * self.rgb888p_size[0]) new_h = (int)(ratio * self.rgb888p_size[1]) dw = (dst_w - new_w) / 2 dh = (dst_h - new_h) / 2 top = (int)(round(0)) bottom = (int)(round(dh * 2 + 0.1)) left = (int)(round(0)) right = (int)(round(dw * 2 - 0.1)) return [0,0,0,0,top, bottom, left, right] # 自定义人脸解析任务类 class FaceParseApp(AIBase): def __init__(self,kmodel_path,model_input_size,rgb888p_size=[1920,1080],display_size=[1920,1080],debug_mode=0): super().__init__(kmodel_path,model_input_size,rgb888p_size,debug_mode) # kmodel路径 self.kmodel_path=kmodel_path # 检测模型输入分辨率 self.model_input_size=model_input_size # sensor给到AI的图像分辨率,宽16字节对齐 self.rgb888p_size=[ALIGN_UP(rgb888p_size[0],16),rgb888p_size[1]] # 视频输出VO分辨率,宽16字节对齐 self.display_size=[ALIGN_UP(display_size[0],16),display_size[1]] # debug模式 self.debug_mode=debug_mode # 实例化Ai2d,用于实现模型预处理 self.ai2d=Ai2d(debug_mode) # 设置Ai2d的输入输出格式和类型 self.ai2d.set_ai2d_dtype(nn.ai2d_format.NCHW_FMT,nn.ai2d_format.NCHW_FMT,np.uint8, np.uint8) # 配置预处理操作,这里使用了affine,Ai2d支持crop/shift/pad/resize/affine,具体代码请打开/sdcard/app/libs/AI2D.py查看 def 
config_preprocess(self,det,input_image_size=None): with ScopedTiming("set preprocess config",self.debug_mode > 0): # 初始化ai2d预处理配置,默认为sensor给到AI的尺寸,可以通过设置input_image_size自行修改输入尺寸 ai2d_input_size=input_image_size if input_image_size else self.rgb888p_size # 计算仿射变换矩阵并设置affine预处理 matrix_dst = self.get_affine_matrix(det) self.ai2d.affine(nn.interp_method.cv2_bilinear,0, 0, 127, 1,matrix_dst) # 构建预处理流程,参数为预处理输入tensor的shape和预处理输出的tensor的shape self.ai2d.build([1,3,ai2d_input_size[1],ai2d_input_size[0]],[1,3,self.model_input_size[1],self.model_input_size[0]]) # 自定义后处理,results是模型输出的array列表,这里将第一个输出返回 def postprocess(self,results): with ScopedTiming("postprocess",self.debug_mode > 0): return results[0] def get_affine_matrix(self,bbox): # 获取仿射矩阵,用于将边界框映射到模型输入空间 with ScopedTiming("get_affine_matrix", self.debug_mode > 1): # 设置缩放因子 factor = 2.7 # 从边界框提取坐标和尺寸 x1, y1, w, h = map(lambda x: int(round(x, 0)), bbox[:4]) # 模型输入大小 edge_size = self.model_input_size[1] # 平移距离,使得模型输入空间的中心对准原点 trans_distance = edge_size / 2.0 # 计算边界框中心点的坐标 center_x = x1 + w / 2.0 center_y = y1 + h / 2.0 # 计算最大边长 maximum_edge = factor * (h if h > w else w) # 计算缩放比例 scale = edge_size * 2.0 / maximum_edge # 计算平移参数 cx = trans_distance - scale * center_x cy = trans_distance - scale * center_y # 创建仿射矩阵 affine_matrix = [scale, 0, cx, 0, scale, cy] return affine_matrix # 人脸解析任务 class FaceParse: def __init__(self,face_det_kmodel,face_parse_kmodel,det_input_size,parse_input_size,anchors,confidence_threshold=0.25,nms_threshold=0.3,rgb888p_size=[1920,1080],display_size=[1920,1080],debug_mode=0): # 人脸检测模型路径 self.face_det_kmodel=face_det_kmodel # 人脸解析模型路径 self.face_pose_kmodel=face_parse_kmodel # 人脸检测模型输入分辨率 self.det_input_size=det_input_size # 人脸解析模型输入分辨率 self.parse_input_size=parse_input_size # anchors self.anchors=anchors # 置信度阈值 self.confidence_threshold=confidence_threshold # nms阈值 self.nms_threshold=nms_threshold # sensor给到AI的图像分辨率,宽16字节对齐 self.rgb888p_size=[ALIGN_UP(rgb888p_size[0],16),rgb888p_size[1]] # 视频输出VO分辨率,宽16字节对齐 self.display_size=[ALIGN_UP(display_size[0],16),display_size[1]] # debug_mode模式 self.debug_mode=debug_mode # 人脸检测任务类实例 self.face_det=FaceDetApp(self.face_det_kmodel,model_input_size=self.det_input_size,anchors=self.anchors,confidence_threshold=self.confidence_threshold,nms_threshold=self.nms_threshold,rgb888p_size=self.rgb888p_size,display_size=self.display_size,debug_mode=0) # 人脸解析实例 self.face_parse=FaceParseApp(self.face_pose_kmodel,model_input_size=self.parse_input_size,rgb888p_size=self.rgb888p_size,display_size=self.display_size) # 人脸检测预处理配置 self.face_det.config_preprocess() # run函数 def run(self,input_np): # 执行人脸检测 det_boxes=self.face_det.run(input_np) parse_res=[] for det_box in det_boxes: # 对检测到每一个人脸进行人脸解析 self.face_parse.config_preprocess(det_box) res=self.face_parse.run(input_np) parse_res.append(res) return det_boxes,parse_res # 绘制人脸解析效果 def draw_result(self,pl,dets,parse_res): pl.osd_img.clear() if dets: draw_img_np = np.zeros((self.display_size[1],self.display_size[0],4),dtype=np.uint8) draw_img=image.Image(self.display_size[0], self.display_size[1], image.ARGB8888,alloc=image.ALLOC_REF,data=draw_img_np) for i,det in enumerate(dets): # (1)将人脸检测框画到draw_img x, y, w, h = map(lambda x: int(round(x, 0)), det[:4]) x = x * self.display_size[0] // self.rgb888p_size[0] y = y * self.display_size[1] // self.rgb888p_size[1] w = w * self.display_size[0] // self.rgb888p_size[0] h = h * self.display_size[1] // self.rgb888p_size[1] 
aidemo.face_parse_post_process(draw_img_np,self.rgb888p_size,self.display_size,self.parse_input_size[0],det.tolist(),parse_res[i]) pl.osd_img.copy_from(draw_img) if __name__=="__main__": # 显示模式,默认"hdmi",可以选择"hdmi"和"lcd",k230d受限于内存不支持 display_mode="hdmi" if display_mode=="hdmi": display_size=[1920,1080] else: display_size=[800,480] # 人脸检测模型路径 face_det_kmodel_path="/sdcard/examples/kmodel/face_detection_320.kmodel" # 人脸解析模型路径 face_parse_kmodel_path="/sdcard/examples/kmodel/face_parse.kmodel" # 其他参数 anchors_path="/sdcard/examples/utils/prior_data_320.bin" rgb888p_size=[1920,1080] face_det_input_size=[320,320] face_parse_input_size=[320,320] confidence_threshold=0.5 nms_threshold=0.2 anchor_len=4200 det_dim=4 anchors = np.fromfile(anchors_path, dtype=np.float) anchors = anchors.reshape((anchor_len,det_dim)) # 初始化PipeLine,只关注传给AI的图像分辨率,显示的分辨率 pl=PipeLine(rgb888p_size=rgb888p_size,display_size=display_size,display_mode=display_mode) pl.create() fp=FaceParse(face_det_kmodel_path,face_parse_kmodel_path,det_input_size=face_det_input_size,parse_input_size=face_parse_input_size,anchors=anchors,confidence_threshold=confidence_threshold,nms_threshold=nms_threshold,rgb888p_size=rgb888p_size,display_size=display_size) try: while True: os.exitpoint() with ScopedTiming("total",1): img=pl.get_frame() # 获取当前帧 det_boxes,parse_res=fp.run(img) # 推理当前帧 fp.draw_result(pl,det_boxes,parse_res) # 绘制当前帧推理结果 pl.show_image() # 展示推理效果 gc.collect() except Exception as e: sys.print_exception(e) finally: fp.face_det.deinit() fp.face_parse.deinit() pl.destroy() 以上是全部代码,请帮我解决问题 Traceback (most recent call last): File "<stdin>", line 156, in <module> Exception: IDE interrupt MPY: soft reboot CanMV v1.2.2(based on Micropython e00a144) on 2025-06-16; k230_canmv_lckfb with K230

from libs.PipeLine import PipeLine, ScopedTiming from libs.AIBase import AIBase from libs.AI2D import Ai2d import os import ujson from media.media import * from time import * from machine import UART from machine import FPIOA import nncase_runtime as nn import ulab.numpy as np import time import image import aicube import random import gc import sys # 自定义OCR检测类 class OCRDetectionApp(AIBase): def __init__(self,kmodel_path,model_input_size,mask_threshold=0.5,box_threshold=0.2,rgb888p_size=[224,224],display_size=[1920,1080],debug_mode=0): super().__init__(kmodel_path,model_input_size,rgb888p_size,debug_mode) self.kmodel_path=kmodel_path # 模型输入分辨率 self.model_input_size=model_input_size # 分类阈值 self.mask_threshold=mask_threshold self.box_threshold=box_threshold # sensor给到AI的图像分辨率 self.rgb888p_size=[ALIGN_UP(rgb888p_size[0],16),rgb888p_size[1]] # 显示分辨率 self.display_size=[ALIGN_UP(display_size[0],16),display_size[1]] self.debug_mode=debug_mode # Ai2d实例,用于实现模型预处理 self.ai2d=Ai2d(debug_mode) # 设置Ai2d的输入输出格式和类型 self.ai2d.set_ai2d_dtype(nn.ai2d_format.NCHW_FMT,nn.ai2d_format.NCHW_FMT,np.uint8, np.uint8) # 配置预处理操作,这里使用了pad和resize,Ai2d支持crop/shift/pad/resize/affine,具体代码请打开/sdcard/app/libs/AI2D.py查看 def config_preprocess(self,input_image_size=None): with ScopedTiming("set preprocess config",self.debug_mode > 0): # 初始化ai2d预处理配置,默认为sensor给到AI的尺寸,您可以通过设置input_image_size自行修改输入尺寸 ai2d_input_size=input_image_size if input_image_size else self.rgb888p_size top,bottom,left,right=self.get_padding_param() self.ai2d.pad([0,0,0,0,top,bottom,left,right], 0, [0,0,0]) self.ai2d.resize(nn.interp_method.tf_bilinear, nn.interp_mode.half_pixel) self.ai2d.build([1,3,ai2d_input_size[1],ai2d_input_size[0]],[1,3,self.model_input_size[1],self.model_input_size[0]]) # 自定义当前任务的后处理 def postprocess(self,results): with ScopedTiming("postprocess",self.debug_mode > 0): # chw2hwc hwc_array=self.chw2hwc(self.cur_img) # 这里使用了aicube封装的接口ocr_post_process做后处理,返回的det_boxes结构为[[crop_array_nhwc,[p1_x,p1_y,p2_x,p2_y,p3_x,p3_y,p4_x,p4_y]],...] 
det_boxes = aicube.ocr_post_process(results[0][:,:,:,0].reshape(-1), hwc_array.reshape(-1),self.model_input_size,self.rgb888p_size, self.mask_threshold, self.box_threshold) return det_boxes # 计算padding参数 def get_padding_param(self): # 右padding或下padding dst_w = self.model_input_size[0] dst_h = self.model_input_size[1] input_width = self.rgb888p_size[0] input_high = self.rgb888p_size[1] ratio_w = dst_w / input_width ratio_h = dst_h / input_high if ratio_w < ratio_h: ratio = ratio_w else: ratio = ratio_h new_w = (int)(ratio * input_width) new_h = (int)(ratio * input_high) dw = (dst_w - new_w) / 2 dh = (dst_h - new_h) / 2 top = (int)(round(0)) bottom = (int)(round(dh * 2 + 0.1)) left = (int)(round(0)) right = (int)(round(dw * 2 - 0.1)) return top, bottom, left, right # chw2hwc def chw2hwc(self,features): ori_shape = (features.shape[0], features.shape[1], features.shape[2]) c_hw_ = features.reshape((ori_shape[0], ori_shape[1] * ori_shape[2])) hw_c_ = c_hw_.transpose() new_array = hw_c_.copy() hwc_array = new_array.reshape((ori_shape[1], ori_shape[2], ori_shape[0])) del c_hw_ del hw_c_ del new_array return hwc_array # 自定义OCR识别任务类 class OCRRecognitionApp(AIBase): def __init__(self,kmodel_path,model_input_size,dict_path,rgb888p_size=[1920,1080],display_size=[1920,1080],debug_mode=0): super().__init__(kmodel_path,model_input_size,rgb888p_size,debug_mode) # kmodel路径 self.kmodel_path=kmodel_path # 识别模型输入分辨率 self.model_input_size=model_input_size self.dict_path=dict_path # sensor给到AI的图像分辨率,宽16字节对齐 self.rgb888p_size=[ALIGN_UP(rgb888p_size[0],16),rgb888p_size[1]] # 视频输出VO分辨率,宽16字节对齐 self.display_size=[ALIGN_UP(display_size[0],16),display_size[1]] # debug模式 self.debug_mode=debug_mode self.dict_word=None # 读取OCR的字典 self.read_dict() self.ai2d=Ai2d(debug_mode) self.ai2d.set_ai2d_dtype(nn.ai2d_format.RGB_packed,nn.ai2d_format.NCHW_FMT,np.uint8, np.uint8) # 配置预处理操作,这里使用了pad和resize,Ai2d支持crop/shift/pad/resize/affine,具体代码请打开/sdcard/app/libs/AI2D.py查看 def config_preprocess(self,input_image_size=None,input_np=None): with ScopedTiming("set preprocess config",self.debug_mode > 0): ai2d_input_size=input_image_size if input_image_size else self.rgb888p_size top,bottom,left,right=self.get_padding_param(ai2d_input_size,self.model_input_size) self.ai2d.pad([0,0,0,0,top,bottom,left,right], 0, [0,0,0]) self.ai2d.resize(nn.interp_method.tf_bilinear, nn.interp_mode.half_pixel) # 如果传入input_np,输入shape为input_np的shape,如果不传入,输入shape为[1,3,ai2d_input_size[1],ai2d_input_size[0]] self.ai2d.build([input_np.shape[0],input_np.shape[1],input_np.shape[2],input_np.shape[3]],[1,3,self.model_input_size[1],self.model_input_size[0]]) # 自定义后处理,results是模型输出的array列表 def postprocess(self,results): with ScopedTiming("postprocess",self.debug_mode > 0): preds = np.argmax(results[0], axis=2).reshape((-1)) output_txt = "" for i in range(len(preds)): # 当前识别字符不是字典的最后一个字符并且和前一个字符不重复(去重),加入识别结果字符串 if preds[i] != (len(self.dict_word) - 1) and (not (i > 0 and preds[i - 1] == preds[i])): output_txt = output_txt + self.dict_word[preds[i]] return output_txt # 计算padding参数 def get_padding_param(self,src_size,dst_size): # 右padding或下padding dst_w = dst_size[0] dst_h = dst_size[1] input_width = src_size[0] input_high = src_size[1] ratio_w = dst_w / input_width ratio_h = dst_h / input_high if ratio_w < ratio_h: ratio = ratio_w else: ratio = ratio_h new_w = (int)(ratio * input_width) new_h = (int)(ratio * input_high) dw = (dst_w - new_w) / 2 dh = (dst_h - new_h) / 2 top = (int)(round(0)) bottom = (int)(round(dh * 2 + 0.1)) left = (int)(round(0)) right = 
(int)(round(dw * 2 - 0.1)) return top, bottom, left, right def read_dict(self): if self.dict_path!="": with open(dict_path, 'r') as file: line_one = file.read(100000) line_list = line_one.split("\r\n") self.dict_word = {num: char.replace("\r", "").replace("\n", "") for num, char in enumerate(line_list)} class OCRDetRec: def __init__(self,ocr_det_kmodel,ocr_rec_kmodel,det_input_size,rec_input_size,dict_path,mask_threshold=0.25,box_threshold=0.3,rgb888p_size=[1920,1080],display_size=[1920,1080],debug_mode=0): # OCR检测模型路径 self.ocr_det_kmodel=ocr_det_kmodel # OCR识别模型路径 self.ocr_rec_kmodel=ocr_rec_kmodel # OCR检测模型输入分辨率 self.det_input_size=det_input_size # OCR识别模型输入分辨率 self.rec_input_size=rec_input_size # 字典路径 self.dict_path=dict_path # 置信度阈值 self.mask_threshold=mask_threshold # nms阈值 self.box_threshold=box_threshold # sensor给到AI的图像分辨率,宽16字节对齐 self.rgb888p_size=[ALIGN_UP(rgb888p_size[0],16),rgb888p_size[1]] # 视频输出VO分辨率,宽16字节对齐 self.display_size=[ALIGN_UP(display_size[0],16),display_size[1]] # debug_mode模式 self.debug_mode=debug_mode self.ocr_det=OCRDetectionApp(self.ocr_det_kmodel,model_input_size=self.det_input_size,mask_threshold=self.mask_threshold,box_threshold=self.box_threshold,rgb888p_size=self.rgb888p_size,display_size=self.display_size,debug_mode=0) self.ocr_rec=OCRRecognitionApp(self.ocr_rec_kmodel,model_input_size=self.rec_input_size,dict_path=self.dict_path,rgb888p_size=self.rgb888p_size,display_size=self.display_size) self.ocr_det.config_preprocess() self.prev_numbers = "" # 存储上次识别的数字 # run函数 def run(self,input_np): # 先进行OCR检测 det_res=self.ocr_det.run(input_np) boxes=[] ocr_res=[] for det in det_res: # 对得到的每个检测框执行OCR识别 self.ocr_rec.config_preprocess(input_image_size=[det[0].shape[2],det[0].shape[1]],input_np=det[0]) ocr_str=self.ocr_rec.run(det[0]) ocr_res.append(ocr_str) boxes.append(det[1]) gc.collect() # 新增:提取数字并发送 numbers = self.extract_numbers(ocr_res) self.send_numbers_via_uart(numbers) return boxes,ocr_res def extract_numbers(self, rec_res): """从识别结果中提取数字""" numbers = "" for text in rec_res: # 过滤非数字字符(保留小数点) for char in text: if char.isdigit() or char == '.': numbers += char return numbers def send_numbers_via_uart(self, numbers): """通过串口发送数字,避免重复发送相同内容""" if numbers and numbers != self.prev_numbers: # 添加起始和结束标记 data = f"<{numbers}>\n" uart.write(data.encode('utf-8')) print("Sent via UART:", data) self.prev_numbers = numbers # 绘制OCR检测识别效果 def draw_result(self,pl,det_res,rec_res): pl.osd_img.clear() if det_res: # 循环绘制所有检测到的框 for j in range(len(det_res)): # 将原图的坐标点转换成显示的坐标点,循环绘制四条直线,得到一个矩形框 for i in range(4): x1 = det_res[j][(i * 2)] / self.rgb888p_size[0] * self.display_size[0] y1 = det_res[j][(i * 2 + 1)] / self.rgb888p_size[1] * self.display_size[1] x2 = det_res[j][((i + 1) * 2) % 8] / self.rgb888p_size[0] * self.display_size[0] y2 = det_res[j][((i + 1) * 2 + 1) % 8] / self.rgb888p_size[1] * self.display_size[1] pl.osd_img.draw_line((int(x1), int(y1), int(x2), int(y2)), color=(255, 0, 0, 255),thickness=5) pl.osd_img.draw_string_advanced(int(x1),int(y1),32,rec_res[j],color=(0,0,255)) if __name__=="__main__": # 显示模式,默认"hdmi",可以选择"hdmi"和"lcd",k230d受限内存不支持 display_mode="lcd" if display_mode=="hdmi": display_size=[1920,1080] else: display_size=[800,480] # OCR检测模型路径 ocr_det_kmodel_path="/sdcard/examples/kmodel/ocr_det_int16.kmodel" # OCR识别模型路径 ocr_rec_kmodel_path="/sdcard/examples/kmodel/ocr_rec_int16.kmodel" # 其他参数 dict_path="/sdcard/examples/utils/dict.txt" rgb888p_size=[640,360] ocr_det_input_size=[640,640] ocr_rec_input_size=[512,32] mask_threshold=0.25 box_threshold=0.3 # 
初始化PipeLine,只关注传给AI的图像分辨率,显示的分辨率 pl=PipeLine(rgb888p_size=rgb888p_size,display_size=display_size,display_mode=display_mode) pl.create() # 实例化FPIOA from machine import FPIOA fpioa = FPIOA() #设置pin44为串口2发送管脚 fpioa.set_function(28, fpioa.UART3_TXD) #设置pin45为串口2接收管脚 fpioa.set_function(29, fpioa.UART3_RXD) #UART: baudrate 115200, 8bits, parity none, one stopbits uart = UART(UART.UART3, baudrate=115200, bits=UART.EIGHTBITS, parity=UART.PARITY_NONE, stop=UART.STOPBITS_ONE) ocr=OCRDetRec(ocr_det_kmodel_path,ocr_rec_kmodel_path,det_input_size=ocr_det_input_size,rec_input_size=ocr_rec_input_size,dict_path=dict_path,mask_threshold=mask_threshold,box_threshold=box_threshold,rgb888p_size=rgb888p_size,display_size=display_size) try: while True: os.exitpoint() with ScopedTiming("total",1): img=pl.get_frame() # 获取当前帧 det_res,rec_res=ocr.run(img) # 推理当前帧 ocr.draw_result(pl,det_res,rec_res) # 绘制当前帧推理结果 pl.show_image() # 展示当前帧推理结果 gc.collect() except Exception as e: sys.print_exception(e) finally: ocr.ocr_det.deinit() ocr.ocr_rec.deinit() pl.destroy()如何让这段代码识别近处的数字且只识别数字

'''实验名称:人脸检测实验平台:01Studio CanMV K230教程:wiki.01studio.cc'''from media.sensor import * #导入sensor模块,使用摄像头相关接口from libs.PipeLine import PipeLine, ScopedTimingfrom libs.AIBase import AIBasefrom libs.AI2D import Ai2dimport osimport ujsonfrom media.media import *from time import *import nncase_runtime as nnimport ulab.numpy as npimport timeimport utimeimport imageimport randomimport gcimport sysimport aidemo# 自定义人脸检测类,继承自AIBase基类class FaceDetectionApp(AIBase): def __init__(self, kmodel_path, model_input_size, anchors, confidence_threshold=0.5, nms_threshold=0.2, rgb888p_size=[224,224], display_size=[1920,1080], debug_mode=0): super().__init__(kmodel_path, model_input_size, rgb888p_size, debug_mode) # 调用基类的构造函数 self.kmodel_path = kmodel_path # 模型文件路径 self.model_input_size = model_input_size # 模型输入分辨率 self.confidence_threshold = confidence_threshold # 置信度阈值 self.nms_threshold = nms_threshold # NMS(非极大值抑制)阈值 self.anchors = anchors # 锚点数据,用于目标检测 self.rgb888p_size = [ALIGN_UP(rgb888p_size[0], 16), rgb888p_size[1]] # sensor给到AI的图像分辨率,并对宽度进行16的对齐 self.display_size = [ALIGN_UP(display_size[0], 16), display_size[1]] # 显示分辨率,并对宽度进行16的对齐 self.debug_mode = debug_mode # 是否开启调试模式 self.ai2d = Ai2d(debug_mode) # 实例化Ai2d,用于实现模型预处理 self.ai2d.set_ai2d_dtype(nn.ai2d_format.NCHW_FMT, nn.ai2d_format.NCHW_FMT, np.uint8, np.uint8) # 设置Ai2d的输入输出格式和类型 # 配置预处理操作,这里使用了pad和resize,Ai2d支持crop/shift/pad/resize/affine,具体代码请打开/sdcard/app/libs/AI2D.py查看 def config_preprocess(self, input_image_size=None): with ScopedTiming("set preprocess config", self.debug_mode > 0): # 计时器,如果debug_mode大于0则开启 ai2d_input_size = input_image_size if input_image_size else self.rgb888p_size # 初始化ai2d预处理配置,默认为sensor给到AI的尺寸,可以通过设置input_image_size自行修改输入尺寸 top, bottom, left, right = self.get_padding_param() # 获取padding参数 self.ai2d.pad([0, 0, 0, 0, top, bottom, left, right], 0, [104, 117, 123]) # 填充边缘 self.ai2d.resize(nn.interp_method.tf_bilinear, nn.interp_mode.half_pixel) # 缩放图像 self.ai2d.build([1,3,ai2d_input_size[1],ai2d_input_size[0]],[1,3,self.model_input_size[1],self.model_input_size[0]]) # 构建预处理流程 # 自定义当前任务的后处理,results是模型输出array列表,这里使用了aidemo库的face_det_post_process接口 def postprocess(self, results): with ScopedTiming("postprocess", self.debug_mode > 0): post_ret = aidemo.face_det_post_process(self.confidence_threshold, self.nms_threshold, self.model_input_size[1], self.anchors, self.rgb888p_size, results) if len(post_ret) == 0: return post_ret else: return post_ret[0] # 绘制检测结果到画面上 def draw_result(self, pl, dets): with ScopedTiming("display_draw", self.debug_mode > 0): if dets: pl.osd_img.clear() # 清除OSD图像 for det in dets: # 将检测框的坐标转换为显示分辨率下的坐标 x, y, w, h = map(lambda x: int(round(x, 0)), det[:4]) x = x * self.display_size[0] // self.rgb888p_size[0] y = y * self.display_size[1] // self.rgb888p_size[1] w = w * self.display_size[0] // self.rgb888p_size[0] h = h * self.display_size[1] // self.rgb888p_size[1] pl.osd_img.draw_rectangle(x, y, w, h, color=(255, 255, 0, 255), thickness=2) # 绘制矩形框 else: pl.osd_img.clear() # 获取padding参数 def get_padding_param(self): dst_w = self.model_input_size[0] # 模型输入宽度 dst_h = self.model_input_size[1] # 模型输入高度 ratio_w = dst_w / self.rgb888p_size[0] # 宽度缩放比例 ratio_h = dst_h / self.rgb888p_size[1] # 高度缩放比例 ratio = min(ratio_w, ratio_h) # 取较小的缩放比例 new_w = int(ratio * self.rgb888p_size[0]) # 新宽度 new_h = int(ratio * self.rgb888p_size[1]) # 新高度 dw = (dst_w - new_w) / 2 # 宽度差 dh = (dst_h - new_h) / 2 # 高度差 top = int(round(0)) bottom = int(round(dh * 2 + 0.1)) left = int(round(0)) right = int(round(dw * 2 - 0.1)) return top, 
bottom, left, rightif __name__ == "__main__": # 显示模式,默认"hdmi",可以选择"hdmi"和"lcd" display_mode="lcd" if display_mode=="hdmi": display_size=[1920,1080] else: display_size=[800,480] # 设置模型路径和其他参数 kmodel_path = "/sdcard/examples/kmodel/face_detection_320.kmodel" # 其它参数 confidence_threshold = 0.5 nms_threshold = 0.2 anchor_len = 4200 det_dim = 4 anchors_path = "/sdcard/examples/utils/prior_data_320.bin" anchors = np.fromfile(anchors_path, dtype=np.float) anchors = anchors.reshape((anchor_len, det_dim)) rgb888p_size = [1920, 1080] # 初始化PipeLine,用于图像处理流程 pl = PipeLine(rgb888p_size=rgb888p_size, display_size=display_size, display_mode=display_mode) pl.create() # 创建PipeLine实例 # 初始化自定义人脸检测实例 face_det = FaceDetectionApp(kmodel_path, model_input_size=[320, 320], anchors=anchors, confidence_threshold=confidence_ 由此实例添加摄像头翻转代码

# === 自定义人脸检测类 === class FaceDetectionApp(AIBase): """自定义人脸检测应用类。""" def __init__(self, kmodel_path, model_input_size, anchors, confidence_threshold=0.5, nms_threshold=0.2, rgb888p_size=[224, 224], display_size=[800, 480], debug_mode=0): super().__init__(kmodel_path, model_input_size, rgb888p_size, debug_mode) self.kmodel_path = kmodel_path self.model_input_size = model_input_size self.confidence_threshold = confidence_threshold self.nms_threshold = nms_threshold self.anchors = anchors self.rgb888p_size = [ALIGN_UP(rgb888p_size[0], 16), rgb888p_size[1]] self.display_size = [ALIGN_UP(display_size[0], 16), display_size[1]] self.debug_mode = debug_mode self.ai2d = Ai2d(debug_mode) self.ai2d.set_ai2d_dtype(nn.ai2d_format.NCHW_FMT, nn.ai2d_format.NCHW_FMT, np.uint8, np.uint8) def config_preprocess(self, input_image_size=None): """配置预处理参数。""" with ScopedTiming("set preprocess config", self.debug_mode > 0): ai2d_input_size = input_image_size if input_image_size else self.rgb888p_size top, bottom, left, right = self.get_padding_param() self.ai2d.pad([0, 0, 0, 0, top, bottom, left, right], 0, [104, 117, 123]) self.ai2d.resize(nn.interp_method.tf_bilinear, nn.interp_mode.half_pixel) self.ai2d.build([1, 3, ai2d_input_size[1], ai2d_input_size[0]], [1, 3, self.model_input_size[1], self.model_input_size[0]]) def postprocess(self, results): """后处理检测结果。""" with ScopedTiming("postprocess", self.debug_mode > 0): post_ret = aidemo.face_det_post_process(self.confidence_threshold, self.nms_threshold, self.model_input_size[1], self.anchors, self.rgb888p_size, results) if len(post_ret) == 0: return post_ret else: return post_ret[0] def draw_result(self, pl, dets): """在屏幕上绘制检测结果。""" with ScopedTiming("display_draw", self.debug_mode > 0): if dets: pl.osd_img.clear() faces = [] for det in dets: x, y, w, h = map(lambda x: int(round(x, 0)), det[:4]) x = x * self.display_size[0] // self.rgb888p_size[0] y = y * self.display_size[1] // self.rgb888p_size[1] w = w * self.display_size[0] // self.rgb888p_size[0] h = h * self.display_size[1] // self.rgb888p_size[1] pl.osd_img.draw_rectangle(x, y, w, h, color=(255, 255, 0, 255), thickness=2) text_left_top = f"({x}, {y})" pl.osd_img.draw_string_advanced(x, y - 25, 20, text_left_top, color=(255, 255, 255)) faces.append((x, y, w, h)) return faces else: pl.osd_img.clear() return [] def get_padding_param(self): """获取填充参数。""" dst_w = self.model_input_size[0] dst_h = self.model_input_size[1] ratio_w = dst_w / self.rgb888p_size[0] ratio_h = dst_h / self.rgb888p_size[1] ratio = min(ratio_w, ratio_h) new_w = int(ratio * self.rgb888p_size[0]) new_h = int(ratio * self.rgb888p_size[1]) dw = (dst_w - new_w) / 2 dh = (dst_h - new_h) / 2 top = int(round(0)) bottom = int(round(dh * 2 + 0.1)) left = int(round(0)) right = int(round(dw * 2 - 0.1)) return top, bottom, left, right def two_side_pad_param(input_size, output_size): ratio_w = output_size[0] / input_size[0] # 宽度缩放比例 ratio_h = output_size[1] / input_size[1] # 高度缩放比例 ratio = min(ratio_w, ratio_h) # 取较小的缩放比例 new_w = int(ratio * input_size[0]) # 新宽度 new_h = int(ratio * input_size[1]) # 新高度 dw = (output_size[0] - new_w) / 2 # 宽度差 dh = (output_size[1] - new_h) / 2 # 高度差 top = int(round(dh - 0.1)) bottom = int(round(dh + 0.1)) left = int(round(dw - 0.1)) right = int(round(dw - 0.1)) return top, bottom, left, right, ratio def read_deploy_config(config_path): # 打开JSON文件以进行读取deploy_config with open(config_path, "r") as json_file: try: # 从文件中加载JSON数据 config = ujson.load(json_file) except ValueError as e: print("JSON 解析错误:", e) return config def 
detection(): print("det_infer start") # 使用json读取内容初始化部署变量 deploy_conf = read_deploy_config(config_path) kmodel_name = deploy_conf["kmodel_path"] labels = deploy_conf["categories"] confidence_threshold = deploy_conf["confidence_threshold"] nms_threshold = deploy_conf["nms_threshold"] img_size = deploy_conf["img_size"] num_classes = deploy_conf["num_classes"] color_four = get_colors(num_classes) nms_option = deploy_conf["nms_option"] model_type = deploy_conf["model_type"] if model_type == "AnchorBaseDet": anchors = deploy_conf["anchors"][0] + deploy_conf["anchors"][1] + deploy_conf["anchors"][2] kmodel_frame_size = img_size frame_size = [OUT_RGB888P_WIDTH, OUT_RGB888P_HEIGH] strides = [8, 16, 32] # 计算padding值 top, bottom, left, right, ratio = two_side_pad_param(frame_size, kmodel_frame_size) # 初始化kpu kpu = nn.kpu() kpu.load_kmodel(root_path + kmodel_name) # 初始化ai2d ai2d = nn.ai2d() ai2d.set_dtype(nn.ai2d_format.NCHW_FMT, nn.ai2d_format.NCHW_FMT, np.uint8, np.uint8) ai2d.set_pad_param(True, [0, 0, 0, 0, top, bottom, left, right], 0, [114, 114, 114]) ai2d.set_resize_param(True, nn.interp_method.tf_bilinear, nn.interp_mode.half_pixel) ai2d_builder = ai2d.build( [1, 3, OUT_RGB888P_HEIGH, OUT_RGB888P_WIDTH], [1, 3, kmodel_frame_size[1], kmodel_frame_size[0]] ) # 初始化并配置sensor sensor = Sensor() sensor.reset() # 设置镜像 sensor.set_hmirror(False) # 设置翻转 sensor.set_vflip(False) # 通道0直接给到显示VO,格式为YUV420 sensor.set_framesize(width=DISPLAY_WIDTH, height=DISPLAY_HEIGHT) sensor.set_pixformat(PIXEL_FORMAT_YUV_SEMIPLANAR_420) # 通道2给到AI做算法处理,格式为RGB888 sensor.set_framesize(width=OUT_RGB888P_WIDTH, height=OUT_RGB888P_HEIGH, chn=CAM_CHN_ID_2) sensor.set_pixformat(PIXEL_FORMAT_RGB_888_PLANAR, chn=CAM_CHN_ID_2) # 绑定通道0的输出到vo sensor_bind_info = sensor.bind_info(x=0, y=0, chn=CAM_CHN_ID_0) Display.bind_layer(**sensor_bind_info, layer=Display.LAYER_VIDEO1) if display_mode == "lcd": # 设置为ST7701显示,默认800x480 Display.init(Display.ST7701, to_ide=True) else: # 设置为LT9611显示,默认1920x1080 Display.init(Display.LT9611, to_ide=True) # 创建OSD图像 osd_img = image.Image(DISPLAY_WIDTH, DISPLAY_HEIGHT, image.ARGB8888) # media初始化 MediaManager.init() # 启动sensor sensor.run() rgb888p_img = None ai2d_input_tensor = None data = np.ones((1, 3, kmodel_frame_size[1], kmodel_frame_size[0]), dtype=np.uint8) ai2d_output_tensor = nn.from_numpy(data) while True: with ScopedTiming("total", debug_mode > 0): rgb888p_img = sensor.snapshot(chn=CAM_CHN_ID_2) if rgb888p_img.format() == image.RGBP888: ai2d_input = rgb888p_img.to_numpy_ref() ai2d_input_tensor = nn.from_numpy(ai2d_input) # 使用ai2d进行预处理 ai2d_builder.run(ai2d_input_tensor, ai2d_output_tensor) # 设置模型输入 kpu.set_input_tensor(0, ai2d_output_tensor) # 模型推理 kpu.run() # 获取模型输出 results = [] for i in range(kpu.outputs_size()): out_data = kpu.get_output_tensor(i) result = out_data.to_numpy() result = result.reshape((result.shape[0] * result.shape[1] * result.shape[2] * result.shape[3])) del out_data results.append(result) # 使用aicube模块封装的接口进行后处理 det_boxes = aicube.anchorbasedet_post_process( results[0], results[1], results[2], kmodel_frame_size, frame_size, strides, num_classes, confidence_threshold, nms_threshold, anchors, nms_option, ) # 绘制结果 osd_img.clear() if det_boxes: for det_boxe in det_boxes: x1, y1, x2, y2 = det_boxe[2], det_boxe[3], det_boxe[4], det_boxe[5] x = int(x1 * DISPLAY_WIDTH // OUT_RGB888P_WIDTH) y = int(y1 * DISPLAY_HEIGHT // OUT_RGB888P_HEIGH) w = int((x2 - x1) * DISPLAY_WIDTH // OUT_RGB888P_WIDTH) h = int((y2 - y1) * DISPLAY_HEIGHT // OUT_RGB888P_HEIGH) osd_img.draw_rectangle(x, y, w, h, 
color=color_four[det_boxe[0]][1:]) text = labels[det_boxe[0]] + " " + str(round(det_boxe[1], 2)) osd_img.draw_string_advanced(x, y - 40, 32, text, color=color_four[det_boxe[0]][1:])解释两段代码并分析相同之处和不同之处

我的是医学影像分类任务,根据你的说明,grad-cam必须叠加在预处理前的原始图像中,结合我的预处理代码和加载模型,class SyncAffined(MapTransform): def __init__(self, keys, atol=1e-5, logger=None): super().__init__(keys) self.orientation = Orientationd(keys=keys, axcodes="RAS") self.resample = ResampleToMatchd(keys=["mask"], key_dst="image", mode="nearest") self.atol = atol # 设置容差值 self.logger = logger # 设置日志记录器 def __call__(self, data): try: # 保存原始 affine 到 meta_dict data["image_meta_dict"]["original_affine"] = data["original_affine"] data["mask_meta_dict"]["original_affine"] = data["mask_original_affine"] # 执行方向对齐 data = self.orientation(data) # 提取仿射矩阵 image_affine = data["image_meta_dict"]["affine"] mask_affine = data["mask_meta_dict"]["affine"] # 确保仿射矩阵是NumPy数组 if isinstance(image_affine, torch.Tensor): image_affine = image_affine.numpy() if isinstance(mask_affine, torch.Tensor): mask_affine = mask_affine.numpy() # 如果仿射矩阵不一致且差异大于容差,则重采样掩膜 if not np.allclose(image_affine, mask_affine, atol=self.atol): if self.logger: diff = np.abs(image_affine - mask_affine).max() self.logger.warning(f"⚠️ affine 不一致 (最大差异: {diff:.2e}),重采样掩膜:{data.get('id', 'unknown')}") data = self.resample(data) # 更新重采样后的 affine data["mask_meta_dict"]["affine"] = data["image_meta_dict"]["affine"].clone() return data except Exception as e: if self.logger: self.logger.error(f"Error during SyncAffined processing: {e}") raise class RecordSpatialInfo(MapTransform): def __init__(self, keys): super().__init__(keys) self.keys = keys def __call__(self, data): for key in self.keys: meta_key = f"{key}_meta_dict" meta = data.get(meta_key, {}) # 原始 shape 来源于 tensor 的 shape(排除 channel 维度) img = data.get(key) if isinstance(img, torch.Tensor): processed_shape = np.array(img.shape[1:]) # (C, D, H, W) → (D, H, W) else: processed_shape = np.array(meta.get("spatial_shape", (1, 1, 1))) # 获取原始 affine 和形状 original_affine = meta.get("original_affine", np.eye(4)) original_shape = meta.get("original_shape", processed_shape) # 计算处理后的 affine processed_affine = meta.get("affine", np.eye(4)) # ✅ 写入 image_meta_dict 中 data[meta_key]["original_shape"] = original_shape data[meta_key]["original_affine"] = original_affine data[meta_key]["processed_affine"] = processed_affine data[meta_key]["processed_shape"] = processed_shape # foreground 起始位置 data["crop_start"] = np.array(data.get("foreground_start_coord", [0, 0, 0])) return data def get_transforms(): deterministic_transforms = Compose([ LoadImaged(keys=["image", "mask"], image_only=False, reader="ITKReader"), EnsureChannelFirstd(keys=["image", "mask"]), SyncAffined(keys=["image", "mask"], atol=1e-10), Spacingd(keys=["image", "mask"], pixdim=(1.0, 1.0, 1.0), mode=("bilinear", "nearest")), CropForegroundd(keys=["image", "mask"], source_key="mask", margin=10), ResizeWithPadOrCropd(keys=["image", "mask"], spatial_size=(64, 64, 64)), RecordSpatialInfo(keys=["image", "mask"]), # 同时记录图像和掩膜的空间信息 ScaleIntensityRanged(keys=["image"], a_min=20, a_max=80, b_min=0.0, b_max=1.0, clip=True), EnsureTyped(keys=["image", "mask"], data_type="tensor"), ], map_items=True, overrides={"allow_missing_keys": True}) augmentation_transforms = Compose([ RandFlipd(keys=["image", "mask"], prob=0.2, spatial_axis=[0, 1, 2]), RandAffined( keys=["image", "mask"], prob=0.3, rotate_range=(-0.2, 0.2), scale_range=(0.8, 1.2), shear_range=(-0.1, 0.1, -0.1, 0.1, -0.1, 0.1), translate_range=(5, 5, 5), mode=("bilinear", "nearest"), padding_mode="border", spatial_size=(64, 64, 64) ), Lambdad(keys=["label"], func=lambda x: torch.tensor(x, dtype=torch.long)) ]) return deterministic_transforms, 
augmentation_transforms deterministic_transforms, augmentation_transforms = get_transforms(),生成grad-cam和原始图像叠加图。

import gc import os import time import math import aicube import image import nncase_runtime as nn import ujson import ulab.numpy as np from libs.PipeLine import ScopedTiming from libs.Utils import * from media.display import * from media.media import * from media.sensor import * # 显示模式设置 display_mode = "lcd" if display_mode == "lcd": DISPLAY_WIDTH = ALIGN_UP(800, 16) DISPLAY_HEIGHT = 480 else: DISPLAY_WIDTH = ALIGN_UP(1920, 16) DISPLAY_HEIGHT = 1080 # 图像处理尺寸 OUT_RGB888P_WIDTH = ALIGN_UP(640, 16) OUT_RGB888P_HEIGH = 360 # 配置文件路径 root_path = "/sdcard/mp_deployment_source/" config_path = root_path + "deploy_config.json" deploy_conf = {} debug_mode = 1 # 摄像头参数(需要根据实际摄像头校准) FOCAL_LENGTH = 600 # 焦距(像素单位) AVERAGE_OBJECT_HEIGHT = { # 常见物体的平均物理高度(单位:米) "person": 1.7, "car": 1.5, "chair": 0.5, # 添加更多物体类型... } def two_side_pad_param(input_size, output_size): """计算图像填充参数""" ratio_w = output_size[0] / input_size[0] ratio_h = output_size[1] / input_size[1] ratio = min(ratio_w, ratio_h) new_w = int(ratio * input_size[0]) new_h = int(ratio * input_size[1]) dw = (output_size[0] - new_w) / 2 dh = (output_size[1] - new_h) / 2 top = int(round(dh - 0.1)) bottom = int(round(dh + 0.1)) left = int(round(dw - 0.1)) right = int(round(dw - 0.1)) return top, bottom, left, right, ratio def read_deploy_config(config_path): """读取部署配置文件""" with open(config_path, "r") as json_file: try: config = ujson.load(json_file) except ValueError as e: print("JSON 解析错误:", e) return config def estimate_distance(box, label, frame_height): """ 估算物体到摄像头的距离 基于物体高度和相似三角形原理 """ # 获取物体的预期物理高度(默认0.5米) object_height = AVERAGE_OBJECT_HEIGHT.get(label.lower(), 0.5) # 计算检测框的像素高度 pixel_height = box[5] - box[3] # y2 - y1 # 使用相似三角形计算距离:距离 = (实际高度 × 焦距) / 像素高度 distance = (object_height * FOCAL_LENGTH) / pixel_height return distance def detection(): print("det_infer start") # 读取部署配置 deploy_conf = read_deploy_config(config_path) kmodel_name = deploy_conf["kmodel_path"] labels = deploy_conf["categories"] confidence_threshold = deploy_conf["confidence_threshold"] nms_threshold = deploy_conf["nms_threshold"] img_size = deploy_conf["img_size"] num_classes = deploy_conf["num_classes"] color_four = get_colors(num_classes) nms_option = deploy_conf["nms_option"] model_type = deploy_conf["model_type"] if model_type == "AnchorBaseDet": anchors = deploy_conf["anchors"][0] + deploy_conf["anchors"][1] + deploy_conf["anchors"][2] kmodel_frame_size = img_size frame_size = [OUT_RGB888P_WIDTH, OUT_RGB888P_HEIGH] strides = [8, 16, 32] # 计算填充参数 top, bottom, left, right, ratio = two_side_pad_param(frame_size, kmodel_frame_size) # 初始化kpu kpu = nn.kpu() kpu.load_kmodel(root_path + kmodel_name) # 初始化ai2d ai2d = nn.ai2d() ai2d.set_dtype(nn.ai2d_format.NCHW_FMT, nn.ai2d_format.NCHW_FMT, np.uint8, np.uint8) ai2d.set_pad_param(True, [0, 0, 0, 0, top, bottom, left, right], 0, [114, 114, 114]) ai2d.set_resize_param(True, nn.interp_method.tf_bilinear, nn.interp_mode.half_pixel) ai2d_builder = ai2d.build( [1, 3, OUT_RGB888P_HEIGH, OUT_RGB888P_WIDTH], [1, 3, kmodel_frame_size[1], kmodel_frame_size[0]] ) # 初始化传感器 sensor = Sensor() sensor.reset() sensor.set_hmirror(False) sensor.set_vflip(False) sensor.set_framesize(width=DISPLAY_WIDTH, height=DISPLAY_HEIGHT) sensor.set_pixformat(PIXEL_FORMAT_YUV_SEMIPLANAR_420) sensor.set_framesize(width=OUT_RGB888P_WIDTH, height=OUT_RGB888P_HEIGH, chn=CAM_CHN_ID_2) sensor.set_pixformat(PIXEL_FORMAT_RGB_888_PLANAR, chn=CAM_CHN_ID_2) # 绑定显示通道 sensor_bind_info = sensor.bind_info(x=0, y=0, chn=CAM_CHN_ID_0) Display.bind_layer(**sensor_bind_info, 
layer=Display.LAYER_VIDEO1) # 初始化显示设备 if display_mode == "lcd": Display.init(Display.ST7701, to_ide=True) else: Display.init(Display.LT9611, to_ide=True) # 创建OSD图像 osd_img = image.Image(DISPLAY_WIDTH, DISPLAY_HEIGHT, image.ARGB8888) # 初始化媒体管理器 MediaManager.init() # 启动传感器 sensor.run() rgb888p_img = None ai2d_input_tensor = None data = np.ones((1, 3, kmodel_frame_size[1], kmodel_frame_size[0]), dtype=np.uint8) ai2d_output_tensor = nn.from_numpy(data) # 主循环 while True: with ScopedTiming("total", debug_mode > 0): rgb888p_img = sensor.snapshot(chn=CAM_CHN_ID_2) if rgb888p_img.format() == image.RGBP888: ai2d_input = rgb888p_img.to_numpy_ref() ai2d_input_tensor = nn.from_numpy(ai2d_input) # 图像预处理 ai2d_builder.run(ai2d_input_tensor, ai2d_output_tensor) # 模型推理 kpu.set_input_tensor(0, ai2d_output_tensor) kpu.run() # 获取输出 results = [] for i in range(kpu.outputs_size()): out_data = kpu.get_output_tensor(i) result = out_data.to_numpy() result = result.reshape((result.shape[0] * result.shape[1] * result.shape[2] * result.shape[3])) del out_data results.append(result) # 后处理 det_boxes = aicube.anchorbasedet_post_process( results[0], results[1], results[2], kmodel_frame_size, frame_size, strides, num_classes, confidence_threshold, nms_threshold, anchors, nms_option, ) osd_img.clear() if det_boxes: for i, det_box in enumerate(det_boxes): x1, y1, x2, y2 = det_box[2], det_box[3], det_box[4], det_box[5] # 绘制检测框 x = int(x1 * DISPLAY_WIDTH // OUT_RGB888P_WIDTH) y = int(y1 * DISPLAY_HEIGHT // OUT_RGB888P_HEIGH) w = int((x2 - x1) * DISPLAY_WIDTH // OUT_RGB888P_WIDTH) h = int((y2 - y1) * DISPLAY_HEIGHT // OUT_RGB888P_HEIGH) osd_img.draw_rectangle(x, y, w, h, color=color_four[det_box[0]][1:]) # 估算物体距离 label = labels[det_box[0]] distance = estimate_distance(det_box, label, OUT_RGB888P_HEIGH) # 显示标签和距离 text = f"{label} {det_box[1]:.2f} | {distance:.2f}m" osd_img.draw_string_advanced(x, y - 40, 32, text, color=color_four[det_box[0]][1:]) Display.show_image(osd_img, 0, 0, Display.LAYER_OSD3) gc.collect() rgb888p_img = None # 清理资源 del ai2d_input_tensor del ai2d_output_tensor sensor.stop() Display.deinit() MediaManager.deinit() gc.collect() time.sleep(1) nn.shrink_memory_pool() print("det_infer end") return 0 if __name__ == "__main__": detection() 给代码加入注释

import os import ujson import aicube from libs.PipeLine import ScopedTiming from libs.Utils import * from media.sensor import * from media.display import * from media.media import * import nncase_runtime as nn import ulab.numpy as np import image import gc display_mode="lcd" if display_mode=="lcd": DISPLAY_WIDTH = ALIGN_UP(800, 16) DISPLAY_HEIGHT = 480 else: DISPLAY_WIDTH = ALIGN_UP(1920, 16) DISPLAY_HEIGHT = 1080 OUT_RGB888P_WIDTH = ALIGN_UP(1280, 16) OUT_RGB888P_HEIGH = 720 root_path="./sdcard/mp_deployment_source/" config_path=root_path+"deploy_config.json" deploy_conf={} debug_mode=1 def two_side_pad_param(input_size,output_size): ratio_w = output_size[0] / input_size[0] # 宽度缩放比例 ratio_h = output_size[1] / input_size[1] # 高度缩放比例 ratio = min(ratio_w, ratio_h) # 取较小的缩放比例 new_w = int(ratio * input_size[0]) # 新宽度 new_h = int(ratio * input_size[1]) # 新高度 dw = (output_size[0] - new_w) / 2 # 宽度差 dh = (output_size[1] - new_h) / 2 # 高度差 top = int(round(dh - 0.1)) bottom = int(round(dh + 0.1)) left = int(round(dw - 0.1)) right = int(round(dw - 0.1)) return top, bottom, left, right,ratio def read_deploy_config(config_path): # 打开JSON文件以进行读取deploy_config with open(config_path, 'r') as json_file: try: # 从文件中加载JSON数据 config = ujson.load(json_file) except ValueError as e: print("JSON 解析错误:", e) return config def detection(): print("det_infer start") # 使用json读取内容初始化部署变量 deploy_conf=read_deploy_config(config_path) kmodel_name=deploy_conf["kmodel_path"] labels=deploy_conf["categories"] confidence_threshold= deploy_conf["confidence_threshold"] nms_threshold = deploy_conf["nms_threshold"] img_size=deploy_conf["img_size"] num_classes=deploy_conf["num_classes"] color_four=get_colors(num_classes) nms_option = deploy_conf["nms_option"] model_type = deploy_conf["model_type"] if model_type == "AnchorBaseDet": anchors = deploy_conf["anchors"][0] + deploy_conf["anchors"][1] + deploy_conf["anchors"][2] kmodel_frame_size = img_size frame_size = [OUT_RGB888P_WIDTH,OUT_RGB888P_HEIGH] strides = [8,16,32] # 计算padding值 top, bottom, left, right,ratio=two_side_pad_param(frame_size,kmodel_frame_size) # 初始化kpu kpu = nn.kpu() kpu.load_kmodel(root_path+kmodel_name) # 初始化ai2d ai2d = nn.ai2d() ai2d.set_dtype(nn.ai2d_format.NCHW_FMT,nn.ai2d_format.NCHW_FMT,np.uint8, np.uint8) ai2d.set_pad_param(True, [0,0,0,0,top,bottom,left,right], 0, [114,114,114]) ai2d.set_resize_param(True, nn.interp_method.tf_bilinear, nn.interp_mode.half_pixel ) ai2d_builder = ai2d.build([1,3,OUT_RGB888P_HEIGH,OUT_RGB888P_WIDTH], [1,3,kmodel_frame_size[1],kmodel_frame_size[0]]) # 初始化并配置sensor sensor = Sensor() sensor.reset() # 设置镜像 sensor.set_hmirror(False) # 设置翻转 sensor.set_vflip(False) # 通道0直接给到显示VO,格式为YUV420 sensor.set_framesize(width = DISPLAY_WIDTH, height = DISPLAY_HEIGHT) sensor.set_pixformat(PIXEL_FORMAT_YUV_SEMIPLANAR_420) # 通道2给到AI做算法处理,格式为RGB888 sensor.set_framesize(width = OUT_RGB888P_WIDTH , height = OUT_RGB888P_HEIGH, chn=CAM_CHN_ID_2) sensor.set_pixformat(PIXEL_FORMAT_RGB_888_PLANAR, chn=CAM_CHN_ID_2) # 绑定通道0的输出到vo sensor_bind_info = sensor.bind_info(x = 0, y = 0, chn = CAM_CHN_ID_0) Display.bind_layer(**sensor_bind_info, layer = Display.LAYER_VIDEO1) if display_mode=="lcd": # 设置为ST7701显示,默认800x480 Display.init(Display.ST7701, to_ide = True) else: # 设置为LT9611显示,默认1920x1080 Display.init(Display.LT9611, to_ide = True) #创建OSD图像 osd_img = image.Image(DISPLAY_WIDTH, DISPLAY_HEIGHT, image.ARGB8888) # media初始化 MediaManager.init() # 启动sensor sensor.run() rgb888p_img = None ai2d_input_tensor = None data = 
np.ones((1,3,kmodel_frame_size[1],kmodel_frame_size[0]),dtype=np.uint8) ai2d_output_tensor = nn.from_numpy(data) while True: with ScopedTiming("total",debug_mode > 0): rgb888p_img = sensor.snapshot(chn=CAM_CHN_ID_2) if rgb888p_img.format() == image.RGBP888: ai2d_input = rgb888p_img.to_numpy_ref() ai2d_input_tensor = nn.from_numpy(ai2d_input) # 使用ai2d进行预处理 ai2d_builder.run(ai2d_input_tensor, ai2d_output_tensor) # 设置模型输入 kpu.set_input_tensor(0, ai2d_output_tensor) # 模型推理 kpu.run() # 获取模型输出 results = [] for i in range(kpu.outputs_size()): out_data = kpu.get_output_tensor(i) result = out_data.to_numpy() result = result.reshape((result.shape[0]*result.shape[1]*result.shape[2]*result.shape[3])) del out_data results.append(result) # 使用aicube模块封装的接口进行后处理 det_boxes = aicube.anchorbasedet_post_process( results[0], results[1], results[2], kmodel_frame_size, frame_size, strides, num_classes, confidence_threshold, nms_threshold, anchors, nms_option) # 绘制结果 osd_img.clear() if det_boxes: for det_boxe in det_boxes: x1, y1, x2, y2 = det_boxe[2],det_boxe[3],det_boxe[4],det_boxe[5] x=int(x1 * DISPLAY_WIDTH // OUT_RGB888P_WIDTH) y=int(y1 * DISPLAY_HEIGHT // OUT_RGB888P_HEIGH) w = int((x2 - x1) * DISPLAY_WIDTH // OUT_RGB888P_WIDTH) h = int((y2 - y1) * DISPLAY_HEIGHT // OUT_RGB888P_HEIGH) osd_img.draw_rectangle(x, y, w, h, color=color_four[det_boxe[0]][1:]) text=labels[det_boxe[0]] + " " + str(round(det_boxe[1],2)) osd_img.draw_string_advanced(x,y-40,32,text, color=color_four[det_boxe[0]][1:]) Display.show_image(osd_img, 0, 0, Display.LAYER_OSD3) gc.collect() rgb888p_img = None del ai2d_input_tensor del ai2d_output_tensor #停止摄像头输出 sensor.stop() #去初始化显示设备 Display.deinit() #释放媒体缓冲区 MediaManager.deinit() gc.collect() time.sleep(1) nn.shrink_memory_pool() print("det_infer end") return 0 if __name__=="__main__": detection() 这个是原代码,怎么修改路径呢

class Unet(nn.Module): def __init__(self, num_classes): def forward(self, x): return out 我需要的输出结果是这样的,图片按照代码和题目要求输出,包括Original Image Ground Truth Prediction三部分,都要有对应的输出,并且参与测试的图片都要输出,需要补全上述代码,英文输出: 输出结果: Starting training... Epoch 1/20: 100%|██████████| 46/46 [00:15<00:00, 3.04it/s, loss=2.49]Epoch 1/20, Training Loss: 2.8437 Validation Loss: 2.4612 New best model with validation loss: 2.4612 Epoch 2/20: 100%|██████████| 46/46 [00:15<00:00, 3.00it/s, loss=1.59]Epoch 2/20, Training Loss: 2.0684 Validation Loss: 1.5868 New best model with validation loss: 1.5868 Epoch 3/20: 100%|██████████| 46/46 [00:15<00:00, 3.00it/s, loss=1.26]Epoch 3/20, Training Loss: 1.3412 Validation Loss: 1.1896 New best model with validation loss: 1.1896 Epoch 4/20: 100%|██████████| 46/46 [00:15<00:00, 3.02it/s, loss=1.16]Epoch 4/20, Training Loss: 1.0508 Validation Loss: 1.0617 New best model with validation loss: 1.0617 Epoch 5/20: 100%|██████████| 46/46 [00:15<00:00, 2.99it/s, loss=0.812] Epoch 5/20, Training Loss: 0.9584 Validation Loss: 1.0257 New best model with validation loss: 1.0257 Epoch 6/20: 100%|██████████| 46/46 [00:15<00:00, 2.96it/s, loss=0.841]Epoch 6/20, Training Loss: 0.9038 Validation Loss: 1.0027 New best model with validation loss: 1.0027 Epoch 7/20: 100%|██████████| 46/46 [00:16<00:00, 2.84it/s, loss=0.77]Epoch 7/20, Training Loss: 0.8736 Validation Loss: 0.9764 New best model with validation loss: 0.9764 Epoch 8/20: 100%|██████████| 46/46 [00:16<00:00, 2.87it/s, loss=0.809]Epoch 8/20, Training Loss: 0.8373 Validation Loss: 0.9694 New best model with validation loss: 0.9694 Epoch 9/20: 100%|██████████| 46/46 [00:15<00:00, 2.99it/s, loss=1.04]Epoch 9/20, Training Loss: 0.8129 Validation Loss: 0.9442 New best model with validation loss: 0.9442 Epoch 10/20: 100%|██████████| 46/46 [00:15<00:00, 3.00it/s, loss=0.838]Epoch 10/20, Training Loss: 0.7859 Validation Loss: 0.9309 New best model with validation loss: 0.9309 Epoch 11/20: 100%|██████████| 46/46 [00:15<00:00, 3.01it/s, loss=0.799]Epoch 11/20, Training Loss: 0.7673 Validation Loss: 0.9087 New best model with validation loss: 0.9087 Epoch 12/20: 100%|██████████| 46/46 [00:15<00:00, 3.02it/s, loss=0.673]Epoch 12/20, Training Loss: 0.7386 Validation Loss: 0.9185 Epoch 13/20: 100%|██████████| 46/46 [00:15<00:00, 3.00it/s, loss=0.638]Epoch 13/20, Training Loss: 0.6899 Validation Loss: 0.8576 New best model with validation loss: 0.8576 Epoch 14/20: 100%|██████████| 46/46 [00:15<00:00, 3.01it/s, loss=0.553]Epoch 14/20, Training Loss: 0.6538 Validation Loss: 0.8267 New best model with validation loss: 0.8267 Epoch 15/20: 100%|██████████| 46/46 [00:14<00:00, 3.07it/s, loss=0.765] Epoch 15/20, Training Loss: 0.6342 Validation Loss: 0.8240 New best model with validation loss: 0.8240 Epoch 16/20: 100%|██████████| 46/46 [00:15<00:00, 2.99it/s, loss=0.688]Epoch 16/20, Training Loss: 0.6203 Validation Loss: 0.8336 Epoch 17/20: 100%|██████████| 46/46 [00:15<00:00, 2.99it/s, loss=0.518]Epoch 17/20, Training Loss: 0.6099 Validation Loss: 0.8014 New best model with validation loss: 0.8014 Epoch 18/20: 100%|██████████| 46/46 [00:15<00:00, 2.93it/s, loss=0.444]Epoch 18/20, Training Loss: 0.6023 Validation Loss: 0.8169 Epoch 19/20: 100%|██████████| 46/46 [00:15<00:00, 2.98it/s, loss=0.822]Epoch 19/20, Training Loss: 0.5885 Validation Loss: 0.8045 Epoch 20/20: 100%|██████████| 46/46 [00:15<00:00, 2.90it/s, loss=0.425] Epoch 20/20, Training Loss: 0.5659 Validation Loss: 0.7840 New best model with validation loss: 0.7840 Training finished! 
<ipython-input-5-1f21aef180ff>:213: FutureWarning: You are using torch.load with weights_only=False (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for weights_only will be flipped to True. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via torch.serialization.add_safe_globals. We recommend you start setting weights_only=True for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  model.load_state_dict(torch.load("best_segmentation_model.pth"))
Model saved to simple_segmentation_model.pth
Visualizing model predictions:

It also has to satisfy the assignment requirements:

Task 1. Implement Unet and train it on the PASCAL VOC dataset
The Unet paper is here: https://arxiv.org/pdf/1505.04597
Use any number of tricks that you can
You cannot use pretrained models, though (until we learn about transfer learning)
You must achieve > 15 mean IOU (the code for evaluation is in the end of the notebook)
Grading rubric:
mean IOU > 15, 10 points
mean 12 < IOU <= 15, 8 points
mean 10 <= IOU <= 12, 5 points
mean IOU < 10, 0 points
Important: you need to achieve 10 and more IOU using all 21 classes from PASCAL VOC
In the end of the notebook you must execute the last cell and pass the tests, otherwise you will receive 0.

The following code must not be modified, and all of it has to run and produce its output:

import os
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torchvision.transforms.functional as TF
import torchvision.models as models
from torchvision.datasets import VOCSegmentation
from tqdm import tqdm

torch.manual_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

DATA_DIR = "./data"
BATCH_SIZE = 32
NUM_EPOCHS = 20         # Increased to get better results
LEARNING_RATE = 0.0001  # Lowered to improve stability
IMAGE_SIZE = (224, 224)

# PASCAL VOC has 21 classes (including background)
NUM_CLASSES = 21

# PASCAL VOC class labels for visualization
VOC_CLASSES = [
    'background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
    'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog',
    'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa',
    'train', 'tvmonitor'
]

# Color map for visualization
VOC_COLORMAP = [
    [0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], [0, 0, 128],
    [128, 0, 128], [0, 128, 128], [128, 128, 128], [64, 0, 0], [192, 0, 0],
    [64, 128, 0], [192, 128, 0], [64, 0, 128], [192, 0, 128], [64, 128, 128],
    [192, 128, 128], [0, 64, 0], [128, 64, 0], [0, 192, 0], [128, 192, 0],
    [0, 64, 128]
]

class SegmentationTransform:
    def __init__(self, size, is_train=False):
        self.size = size
        self.is_train = is_train

    def __call__(self, image, mask):
        if self.is_train and np.random.random() > 0.5:
            image = TF.hflip(image)
            mask = TF.hflip(mask)
        if self.is_train and np.random.random() > 0.7:
            angle = np.random.randint(-10, 10)
            image = TF.rotate(image, angle, interpolation=Image.BILINEAR)
            mask = TF.rotate(mask, angle, interpolation=Image.NEAREST)
        if self.is_train and np.random.random() > 0.7:
            brightness_factor = np.random.uniform(0.8, 1.2)
            contrast_factor = np.random.uniform(0.8, 1.2)
            image = TF.adjust_brightness(image, brightness_factor)
            image = TF.adjust_contrast(image, contrast_factor)
        image = TF.resize(image, self.size, interpolation=Image.BILINEAR)
        mask = TF.resize(mask, self.size, interpolation=Image.NEAREST)
        image = TF.to_tensor(image)
        mask_array = np.array(mask)
        mask_array[mask_array == 255] = 0  # Set ignore pixels to background
        mask = torch.from_numpy(mask_array).long()
        # Normalize image
        image = TF.normalize(image, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        return image, mask

class VOCDatasetWrapper(Dataset):
    def __init__(self, dataset, transform=None):
        self.dataset = dataset
        self.transform = transform

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        image, mask = self.dataset[idx]
        if self.transform:
            image, mask = self.transform(image, mask)
        return image, mask

voc_train = VOCSegmentation(root=DATA_DIR, year='2012', image_set='train', download=True)
voc_val = VOCSegmentation(root=DATA_DIR, year='2012', image_set='val', download=True)

train_transform = SegmentationTransform(IMAGE_SIZE, is_train=True)
val_transform = SegmentationTransform(IMAGE_SIZE, is_train=False)

train_dataset = VOCDatasetWrapper(voc_train, train_transform)
val_dataset = VOCDatasetWrapper(voc_val, val_transform)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)   # Reduced workers
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)      # Reduced workers

# Display some examples from the dataset
def visualize_examples(dataset, num_examples=3):
    fig, axes = plt.subplots(num_examples, 2, figsize=(12, 4 * num_examples))
    for i in range(num_examples):
        # Get a sample
        idx = np.random.randint(0, len(dataset))
        image, mask = dataset.dataset[idx]
        # Original image
        axes[i, 0].imshow(image)
        axes[i, 0].set_title(f"Original Image {idx}")
        axes[i, 0].axis('off')
        # Colored mask
        colored_mask = np.zeros((mask.size[1], mask.size[0], 3), dtype=np.uint8)
        mask_array = np.array(mask)
        for class_idx, color in enumerate(VOC_COLORMAP):
            colored_mask[mask_array == class_idx] = color
        axes[i, 1].imshow(colored_mask)
        axes[i, 1].set_title(f"Segmentation Mask {idx}")
        axes[i, 1].axis('off')
    plt.tight_layout()
    plt.show()

# Visualize examples before training
print("Displaying dataset examples:")
visualize_examples(train_dataset)

import torch

def evaluate_segmentation(model, val_loader, num_classes, device='cuda'):
    model.eval()
    confusion_matrix = torch.zeros(num_classes, num_classes, dtype=torch.long, device=device)
    ignore_index = 255
    with torch.no_grad():
        for images, masks in val_loader:
            images = images.to(device)
            masks = masks.to(device)
            outputs = model(images)
            preds = torch.argmax(outputs, dim=1)  # [B, H, W]
            preds = preds.view(-1)
            masks = masks.view(-1)
            # Filter out ignore pixels
            valid_mask = (masks != ignore_index)
            preds = preds[valid_mask]
            gt = masks[valid_mask]
            # Vectorized confusion matrix update
            indices = gt * num_classes + preds  # also on the GPU
            bins = torch.bincount(indices, minlength=num_classes * num_classes)
            confusion_matrix += bins.reshape(num_classes, num_classes)
    # Move confusion matrix back to CPU if you need .item() or numpy
    confusion_matrix = confusion_matrix.cpu()
    # Compute IoU
    class_iou = []
    for c in range(num_classes):
        TP = confusion_matrix[c, c].item()
        FN = confusion_matrix[c, :].sum().item() - TP
        FP = confusion_matrix[:, c].sum().item() - TP
        denom = TP + FP + FN
        if denom == 0:
            iou_c = float('nan')
        else:
            iou_c = TP / denom
        class_iou.append(iou_c)
    # mean_iou
    valid_iou = [x for x in class_iou if not np.isnan(x)]
    mean_iou = float(np.mean(valid_iou)) if len(valid_iou) > 0 else 0.0
    return class_iou, mean_iou

class_iou, mean_iou = evaluate_segmentation(
    model=trained_model,
    val_loader=val_loader,
    num_classes=NUM_CLASSES,
    device=device
)

# Print results
for i, iou_val in enumerate(class_iou):
    print(f"Class {i} IoU = {iou_val:.4f}")
print(f"Mean IoU over {len(class_iou)} classes = {mean_iou:.4f}")

In particular, this part has to run and produce its output, but the code itself must not be changed:

assert mean_iou > 0.10, 'Your IOU must be larger than 10 to get the grade'

if mean_iou > 0.15:
    print('Full grade, 10 points')
elif 0.12 < mean_iou <= 0.15:
    print('Partial grade, 8 points')
elif 0.10 < mean_iou <= 0.12:
    print('Partial grade, 5 points')
else:
    print('IOU is less than 10, 0 points')

print('All tests pass!')
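The completed network is not shown in this preview. Purely as a hedged sketch of the encoder-decoder structure the assignment asks for, assuming standard PyTorch layers (the DoubleConv helper, the channel widths, and the transposed-convolution upsampling below are illustrative assumptions, not the poster's actual solution), a minimal U-Net for 224x224 PASCAL VOC inputs might look like this:

import torch
import torch.nn as nn

class DoubleConv(nn.Module):
    # Two 3x3 conv + BN + ReLU blocks, the basic building block of U-Net
    def __init__(self, in_ch, out_ch):
        super().__init__()
        self.block = nn.Sequential(
            nn.Conv2d(in_ch, out_ch, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_ch, out_ch, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        return self.block(x)

class Unet(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        widths = [64, 128, 256, 512]          # illustrative channel widths
        self.enc1 = DoubleConv(3, widths[0])
        self.enc2 = DoubleConv(widths[0], widths[1])
        self.enc3 = DoubleConv(widths[1], widths[2])
        self.enc4 = DoubleConv(widths[2], widths[3])
        self.pool = nn.MaxPool2d(2)
        self.bottleneck = DoubleConv(widths[3], widths[3] * 2)
        self.up4 = nn.ConvTranspose2d(widths[3] * 2, widths[3], 2, stride=2)
        self.dec4 = DoubleConv(widths[3] * 2, widths[3])
        self.up3 = nn.ConvTranspose2d(widths[3], widths[2], 2, stride=2)
        self.dec3 = DoubleConv(widths[2] * 2, widths[2])
        self.up2 = nn.ConvTranspose2d(widths[2], widths[1], 2, stride=2)
        self.dec2 = DoubleConv(widths[1] * 2, widths[1])
        self.up1 = nn.ConvTranspose2d(widths[1], widths[0], 2, stride=2)
        self.dec1 = DoubleConv(widths[0] * 2, widths[0])
        self.head = nn.Conv2d(widths[0], num_classes, 1)  # per-pixel class logits

    def forward(self, x):
        e1 = self.enc1(x)                     # 224x224
        e2 = self.enc2(self.pool(e1))         # 112x112
        e3 = self.enc3(self.pool(e2))         # 56x56
        e4 = self.enc4(self.pool(e3))         # 28x28
        b = self.bottleneck(self.pool(e4))    # 14x14
        d4 = self.dec4(torch.cat([self.up4(b), e4], dim=1))
        d3 = self.dec3(torch.cat([self.up3(d4), e3], dim=1))
        d2 = self.dec2(torch.cat([self.up2(d3), e2], dim=1))
        d1 = self.dec1(torch.cat([self.up1(d2), e1], dim=1))
        out = self.head(d1)                   # [B, num_classes, H, W]
        return out

The skip connections concatenate each encoder feature map with the matching decoder stage, which is what lets the network recover fine spatial detail after downsampling. A completed notebook would then train this model with nn.CrossEntropyLoss on train_loader, keep the checkpoint with the lowest validation loss, and pass the trained model to evaluate_segmentation; the exact mean IoU reached depends on the training choices and is not claimed here.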
