[paper]Group Normalization

Group Normalization (GN) was proposed by Yuxin Wu and Kaiming He in 2018 to address the performance degradation of Batch Normalization (BN) at small batch sizes. BN depends on the batch size, yet in tasks such as object detection and segmentation the batch size is usually small. By normalizing over groups of channels instead of over the batch, GN offers a batch-size-independent alternative that maintains stable performance across these tasks.

Group Normalization

Group Normalization was proposed in 2018 by Yuxin Wu and Kaiming He. It is designed to fix shortcomings of BN, so that in certain situations GN can replace BN.

Problems with BN

BN is short for Batch Normalization. Since its introduction in 2015 it has been widely used in deep learning, mainly to accelerate training and improve convergence. BN normalizes along the batch dimension.
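To make "normalizing along the batch dimension" concrete, here is a minimal NumPy sketch of a BN forward pass in training mode; the tensor shapes and epsilon are illustrative assumptions, not values from any specific framework.

```python
import numpy as np

def batch_norm_train(x, gamma, beta, eps=1e-5):
    """Batch Normalization (training mode) for an NCHW tensor."""
    # Per-channel statistics are computed over the batch and spatial
    # axes (N, H, W), so every sample in the batch affects the result.
    mean = x.mean(axis=(0, 2, 3), keepdims=True)
    var = x.var(axis=(0, 2, 3), keepdims=True)
    x_hat = (x - mean) / np.sqrt(var + eps)
    return gamma * x_hat + beta          # learnable per-channel scale/shift

x = np.random.randn(32, 64, 8, 8)        # batch of 32, 64 channels
y = batch_norm_train(x, np.ones((1, 64, 1, 1)), np.zeros((1, 64, 1, 1)))
```

Because the statistics are estimated from the current mini-batch, they become noisy when the batch is small, which is exactly the failure mode discussed below.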


BN's core problem is its dependence on the batch size. A batch size of around 32 is typically needed, and a small batch size degrades its performance. Yet for tasks such as object detection, segmentation, and video recognition, memory constraints often limit the batch size to 1-2. The figure below shows the size of the gap:
[Figure: BN performance gap at small batch sizes]

In addition, Batch Normalization normalizes along the batch dimension, but this dimension is not fixed; in particular, it behaves differently at training and test time. During training, the mean and variance are pre-computed on the training set with a moving average; at test time these values are not recomputed, and the pre-computed statistics are used directly. When the training and test data distributions differ, however, the statistics pre-computed on the training set no longer represent the test data, which introduces an inconsistency across the training, validation, and test phases.
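The sketch below, continuing the earlier NumPy example, illustrates this train/test asymmetry; the momentum value and shapes are illustrative assumptions, not values from the original post.

```python
import numpy as np

# Running statistics for 64 channels, accumulated during training.
# The momentum value is a common default, not taken from the original post.
running_mean = np.zeros((1, 64, 1, 1))
running_var = np.ones((1, 64, 1, 1))
momentum, eps = 0.9, 1e-5

def bn_train_step(x):
    """Normalize with the current batch's statistics and update the EMA."""
    global running_mean, running_var
    mean = x.mean(axis=(0, 2, 3), keepdims=True)
    var = x.var(axis=(0, 2, 3), keepdims=True)
    running_mean = momentum * running_mean + (1 - momentum) * mean
    running_var = momentum * running_var + (1 - momentum) * var
    return (x - mean) / np.sqrt(var + eps)

def bn_test_step(x):
    # Nothing is estimated here: the frozen training-set statistics are
    # reused, so a shifted test distribution makes them stale.
    return (x - running_mean) / np.sqrt(running_var + eps)

for _ in range(100):                                  # training phase
    bn_train_step(np.random.randn(32, 64, 8, 8))
y = bn_test_step(np.random.randn(2, 64, 8, 8))        # test phase
```

GN avoids this asymmetry entirely: since its statistics are computed per sample, the same computation runs at training and test time.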

Group Normalization
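As summarized in the introduction, GN divides the C channels into G groups and computes the mean and variance within each group of each individual sample, over that group's channels and all spatial positions; the batch dimension never enters the statistics. A minimal NumPy sketch of this idea (G=32 follows the paper's default; everything else is an illustrative assumption):

```python
import numpy as np

def group_norm(x, gamma, beta, G=32, eps=1e-5):
    """Group Normalization for an NCHW tensor."""
    N, C, H, W = x.shape
    assert C % G == 0, "C must be divisible by G"
    # Statistics are computed per sample, within each group of C//G
    # channels and over all spatial positions -- independent of N.
    xg = x.reshape(N, G, C // G, H, W)
    mean = xg.mean(axis=(2, 3, 4), keepdims=True)
    var = xg.var(axis=(2, 3, 4), keepdims=True)
    xg = (xg - mean) / np.sqrt(var + eps)
    x_hat = xg.reshape(N, C, H, W)
    return gamma * x_hat + beta          # per-channel scale/shift, as in BN

x = np.random.randn(2, 64, 8, 8)          # works the same at batch size 2
y = group_norm(x, np.ones((1, 64, 1, 1)), np.zeros((1, 64, 1, 1)))
```

With batch size 1 the computation is unchanged, which is why GN's accuracy is insensitive to the batch size. Two limiting cases noted in the paper: G=1 recovers Layer Normalization, and G=C recovers Instance Normalization.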
