Skip to content

【开源实习】Parallel课程资料更新 #109

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import mindspore.nn as nn
import mindspore as ms
import mindspore.ops as ops

ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend")

def weight_variable_0(shape):
"""weight_variable_0"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@
import mindspore as ms
from resnet import resnet50

ms.set_context(mode=ms.GRAPH_MODE, device_target="GPU")
init("nccl")
ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend")
init("hccl")

def create_dataset(data_path, repeat_num=1, batch_size=32, rank_id=0, rank_size=1): # pylint: disable=missing-docstring
resize_height = 224
Expand Down
37 changes: 18 additions & 19 deletions Season1.step_into_chatgpt/5.Parallel/operator_parallel/train.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
"""Operator Parallel Example"""
import sys
import os # 用于获取环境变量
import numpy as np

import mindspore as ms
from mindspore.nn import Cell, Momentum
from mindspore.ops import operations as ops
Expand All @@ -11,12 +9,12 @@
import mindspore.communication as D
from mindspore.common.initializer import initializer

# 获取设备数量的环境变量,如果没有设置,默认为1
devices = int(os.getenv('DEVICE_NUM', 2)) # 从环境变量 DEVICE_NUM 获取设备数

args = sys.argv
devices = int(args[1])

if devices < 1 and devices > 8:
print('device_num error')
# 验证设备数的有效性
if devices < 1 or devices > 8:
print('Invalid number of devices. Exiting...')
exit(0)

step_per_epoch = 4
Expand All @@ -27,38 +25,39 @@ def generate():
yield inputs
return generate


class Net(Cell):
    """Operator-parallel demo network: a sharded MatMul followed by a sharded ReLU."""
    def __init__(self):
        super().__init__()
        # NOTE(review): the three assignments below appear twice — this looks
        # like merged old/new diff-hunk residue; confirm and deduplicate.
        self.matmul = ops.MatMul().shard(((1, 2), (2, 1)))
        self.weight = ms.Parameter(initializer("normal", [32, 16]), "w1")
        self.relu = ops.ReLU().shard(((2, 1),))
        self.matmul = ops.MatMul().shard(((1, 2), (2, 1)))  # MatMul sharded as (1,2) x (2,1)
        self.weight = ms.Parameter(initializer("normal", [32, 16]), "w1")  # weight initialization
        self.relu = ops.ReLU().shard(((2, 1),))  # ReLU sharded as (2,1)

    def construct(self, x):
        # NOTE(review): duplicated statements below — likely diff residue.
        # The net effect equals a single matmul + relu because `out` is
        # overwritten before being returned; confirm and deduplicate.
        out = self.matmul(x, self.weight)
        out = self.relu(out)
        out = self.matmul(x, self.weight)  # matrix multiplication
        out = self.relu(out)  # activation
        return out


if __name__ == "__main__":
ms.set_context(mode=ms.GRAPH_MODE)
ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend") # 设置设备为 Ascend NPU
D.init()
rank = D.get_rank()
ms.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=devices, full_batch=True)
rank = D.get_rank() # 获取当前进程的 rank
ms.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=devices, full_batch=True) # 设置并行上下文

# 数据集生成
np.random.seed(1)
input_data = np.random.rand(16, 32).astype(np.float32)
label_data = np.random.rand(16, 16).astype(np.float32)
fake_dataset = get_dataset(input_data, label_data)

net = Net()

# 回调函数
callback = [train.LossMonitor(), train.ModelCheckpoint(directory="{}".format(rank))]
dataset = ds.GeneratorDataset(fake_dataset, ["input", "label"])
loss = SoftmaxCrossEntropyWithLogits()

# 优化器
learning_rate = 0.001
momentum = 0.1
epoch_size = 5
Expand Down
Original file line number Diff line number Diff line change
@@ -1,38 +1,50 @@
"""Parallel Optimizer Fusion Example"""
from mindspore.communication import init
from mindspore import nn
import mindspore as ms

# Initialize communication for distributed training
init()

ms.set_context(mode=ms.GRAPH_MODE)
ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.SEMI_AUTO_PARALLEL, enable_parallel_optimizer=True)
# Set MindSpore context to GRAPH_MODE for better performance
ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend") # Use Ascend for NPU

# Enable parallel optimizer and set parallel mode
ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.SEMI_AUTO_PARALLEL,
enable_parallel_optimizer=True, device_num=2)

class DenseLayer(nn.Cell):
    """A base layer composed of two dense (fully-connected) layers.

    Maps a 10-dimensional input to a 10-dimensional output through two
    chained ``nn.Dense`` transforms.
    """
    def __init__(self):
        super().__init__()
        # Both mappings keep the feature dimension at 10.
        self.input_mapping = nn.Dense(10, 10)
        self.output_mapping = nn.Dense(10, 10)

    def construct(self, x):
        """Forward pass: apply the two dense layers in sequence."""
        x = self.input_mapping(x)
        return self.output_mapping(x)

class Net(nn.Cell):
    """A network with many dense layers.

    Stacks three ``DenseLayer`` blocks and tags each with a distinct
    communication-fusion id (0, 1, 2) via ``set_comm_fusion`` so that
    gradient communication for each layer can be fused separately.
    """
    def __init__(self):
        super().__init__()
        self.layer1 = DenseLayer()
        self.layer2 = DenseLayer()
        self.layer3 = DenseLayer()

        # Assign a distinct communication-fusion id to each layer's parameters.
        self.layer1.set_comm_fusion(0)
        self.layer2.set_comm_fusion(1)
        self.layer3.set_comm_fusion(2)

    def construct(self, x):
        """Forward pass through the three dense blocks in order."""
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        return x

# Instantiate the model defined above.
net = Net()

# Print the communication-fusion id attached to each trainable parameter,
# demonstrating that set_comm_fusion propagated to the layer parameters.
for item in net.trainable_params():
    print(f"The parameter {item.name}'s fusion id is {item.comm_fusion}")
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import mindspore.nn as nn
import mindspore as ms
import mindspore.ops as ops

from mindspore.communication import init

def weight_variable_0(shape):
"""weight_variable_0"""
Expand Down Expand Up @@ -319,7 +319,12 @@ def construct(self, x):
x = self.squeeze(x)
x = self.fc(x)
return x
# Set context for Ascend NPU
ms.set_context(mode=ms.GRAPH_MODE, device_target="Ascend")
ms.set_auto_parallel_context(parallel_mode=ms.ParallelMode.SEMI_AUTO_PARALLEL, enable_parallel_optimizer=True, device_num=2)

# Initialize communication (for distributed training, if necessary)
init()

def resnet50(batch_size, num_classes):
"""create resnet50"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
from resnet import resnet50

device_target = os.getenv('DEVICE_TARGET')
ms.set_context(mode=ms.GRAPH_MODE, device_target=device_target)
ms.set_context(mode=ms.GRAPH_MODE, device_target='Ascend')

if device_target == "Ascend":
device_id = int(os.getenv('DEVICE_ID'))
ms.set_context(device_id=device_id)
Expand Down
Loading