In naive pipeline parallelism, the weights used in the forward pass can differ from the weights used to compute gradients in the backward pass. To work around this, DeepSpeed makes a compromise and trades away some performance by using gradient accumulation: during backward, gradients are only accumulated locally and the weights are not updated; only after several micro-batches have completed does it perform an all-reduce on the gradients and apply the weight update.
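As a rough illustration of that scheme (a minimal sketch, not DeepSpeed's actual engine code), the loop below accumulates gradients over a group of micro-batches and defers the all-reduce and optimizer step to the end of the group; model, optimizer, loss_fn, and micro_batches are placeholder arguments introduced here for illustration.

    import torch

    def train_step(model, optimizer, micro_batches, loss_fn):
        # Sketch of the gradient-accumulation idea described above:
        # backward() only accumulates into param.grad; the all-reduce and
        # weight update happen once per group of micro-batches.
        optimizer.zero_grad()
        for inputs, labels in micro_batches:
            loss = loss_fn(model(inputs), labels)
            loss.backward()                        # gradients accumulate locally
        if torch.distributed.is_initialized():
            for p in model.parameters():           # one all-reduce per step, not per micro-batch
                if p.grad is not None:
                    torch.distributed.all_reduce(p.grad)
                    p.grad /= torch.distributed.get_world_size()
        optimizer.step()                           # weights updated only after all micro-batches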
import torch
import torch.nn as nn

class AlexNet(nn.Module):
    def __init__(self, num_classes=1000):
        super(AlexNet, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            # ... remaining convolution/pooling layers elided ...
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(),
            # ... remaining fully-connected layers elided ...
            nn.Linear(4096, num_classes),
        )

    def to_layers(self):
        # Flatten the nested nn.Sequential containers into one list of callables
        # so the model can be split into pipeline stages layer by layer.
        # (The tail of this list is reconstructed here following the usual
        # DeepSpeed pipeline-parallelism example; the original text is truncated.)
        layers = [
            *self.features,
            self.avgpool,
            lambda x: torch.flatten(x, 1),
            *self.classifier,
        ]
        return layers
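For context, a minimal usage sketch of how such a flattened layer list is typically handed to DeepSpeed's pipeline engine; the stage count, loss function, and config path below are illustrative assumptions, not values from the original text.

    import torch
    import deepspeed
    from deepspeed.pipe import PipelineModule

    # Illustrative values; num_stages and the loss function are assumptions.
    net = AlexNet(num_classes=1000)
    net = PipelineModule(layers=net.to_layers(),
                         loss_fn=torch.nn.CrossEntropyLoss(),
                         num_stages=2)

    # deepspeed.initialize wraps the PipelineModule in a pipeline engine; each
    # train_batch() call runs gradient_accumulation_steps micro-batches and
    # applies a single weight update at the end, matching the scheme above.
    engine, _, _, _ = deepspeed.initialize(model=net,
                                           model_parameters=net.parameters(),
                                           config="ds_config.json")  # placeholder config path
    # loss = engine.train_batch(data_iter=train_iter)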