@@ -0,0 +1,33 @@ | |||
# Copyright 2020 Huawei Technologies Co., Ltd | |||
# | |||
# Licensed under the Apache License, Version 2.0 (the "License"); | |||
# you may not use this file except in compliance with the License. | |||
# You may obtain a copy of the License at | |||
# | |||
# http://www.apache.org/licenses/LICENSE-2.0 | |||
# | |||
# Unless required by applicable law or agreed to in writing, software | |||
# distributed under the License is distributed on an "AS IS" BASIS, | |||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
# See the License for the specific language governing permissions and | |||
# limitations under the License. | |||
# ============================================================================ | |||
""" | |||
network config setting, will be used in train.py | |||
""" | |||
from easydict import EasyDict as edict | |||
# Hyper-parameter and shape settings shared by train.py / eval.py / inference.
mnist_cfg = edict({
    'num_classes': 10,              # number of MNIST digit classes (0-9)
    'lr': 0.01,                     # learning rate for the Momentum optimizer
    'momentum': 0.9,                # momentum coefficient for the optimizer
    'epoch_size': 10,               # default number of training epochs
    'batch_size': 32,               # samples per batch
    'buffer_size': 1000,            # NOTE(review): appears unused — dataset.py shuffles with a local 10000; confirm before relying on it
    'image_height': 32,             # images are resized to 32x32 in dataset.py
    'image_width': 32,
    'save_checkpoint_steps': 1875,  # steps between checkpoints (1875 = 60000/32, i.e. one epoch)
    'keep_checkpoint_max': 10,      # maximum number of checkpoint files kept
    'air_name': "lenet",            # name used when exporting the AIR model
})
@@ -0,0 +1,60 @@ | |||
# Copyright 2020 Huawei Technologies Co., Ltd | |||
# | |||
# Licensed under the Apache License, Version 2.0 (the "License"); | |||
# you may not use this file except in compliance with the License. | |||
# You may obtain a copy of the License at | |||
# | |||
# http://www.apache.org/licenses/LICENSE-2.0 | |||
# | |||
# Unless required by applicable law or agreed to in writing, software | |||
# distributed under the License is distributed on an "AS IS" BASIS, | |||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
# See the License for the specific language governing permissions and | |||
# limitations under the License. | |||
# ============================================================================ | |||
""" | |||
Produce the dataset | |||
""" | |||
import mindspore.dataset as ds | |||
import mindspore.dataset.vision.c_transforms as CV | |||
import mindspore.dataset.transforms.c_transforms as C | |||
from mindspore.dataset.vision import Inter | |||
from mindspore.common import dtype as mstype | |||
def create_dataset(data_path, batch_size=32, repeat_size=1,
                   num_parallel_workers=1):
    """Build the MNIST data pipeline used for training or evaluation.

    Args:
        data_path (str): directory containing the raw MNIST files.
        batch_size (int): samples per batch. Default: 32.
        repeat_size (int): number of dataset repetitions. Default: 1.
        num_parallel_workers (int): parallelism of each map op. Default: 1.

    Returns:
        A shuffled, batched (drop_remainder) and repeated MindSpore dataset.
    """
    mnist_ds = ds.MnistDataset(data_path)

    # Pixel transforms: scale to [0, 1], then normalize with the standard
    # MNIST statistics (mean 0.1307, std 0.3081).
    target_h, target_w = 32, 32
    scale_01, shift_01 = 1.0 / 255.0, 0.0
    nml_scale = 1 / 0.3081
    nml_shift = -1 * 0.1307 / 0.3081

    # Label column: cast to int32 as expected by the sparse softmax loss.
    mnist_ds = mnist_ds.map(operations=C.TypeCast(mstype.int32),
                            input_columns="label",
                            num_parallel_workers=num_parallel_workers)

    # Image column: resize -> rescale -> normalize -> HWC-to-CHW, in order.
    image_ops = [
        CV.Resize((target_h, target_w), interpolation=Inter.LINEAR),  # bilinear
        CV.Rescale(scale_01, shift_01),
        CV.Rescale(nml_scale, nml_shift),
        CV.HWC2CHW(),
    ]
    for op in image_ops:
        mnist_ds = mnist_ds.map(operations=op,
                                input_columns="image",
                                num_parallel_workers=num_parallel_workers)

    # Dataset-level ops: shuffle with buffer 10000 (as in the LeNet train
    # script), batch with drop_remainder, then repeat.
    mnist_ds = mnist_ds.shuffle(buffer_size=10000)
    mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True)
    mnist_ds = mnist_ds.repeat(repeat_size)
    return mnist_ds
@@ -0,0 +1,64 @@ | |||
# Copyright 2020 Huawei Technologies Co., Ltd | |||
# | |||
# Licensed under the Apache License, Version 2.0 (the "License"); | |||
# you may not use this file except in compliance with the License. | |||
# You may obtain a copy of the License at | |||
# | |||
# http://www.apache.org/licenses/LICENSE-2.0 | |||
# | |||
# Unless required by applicable law or agreed to in writing, software | |||
# distributed under the License is distributed on an "AS IS" BASIS, | |||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
# See the License for the specific language governing permissions and | |||
# limitations under the License. | |||
# ============================================================================ | |||
""" | |||
######################## eval lenet example ######################## | |||
eval lenet according to model file: | |||
python eval.py --data_path /YourDataPath --ckpt_path Your.ckpt | |||
""" | |||
# python eval.py --device_target=CPU --ckpt_path=./ckpt/checkpoint_lenet-1_1875.ckpt > log_eval.txt 2>&1 | |||
# python eval.py --device_target=CPU --ckpt_path=./ckpt/checkpoint_lenet-10_1875.ckpt > log_eval2.txt 2>&1 | |||
import os | |||
import argparse | |||
import mindspore.nn as nn | |||
from mindspore import context | |||
from mindspore.train.serialization import load_checkpoint, load_param_into_net | |||
from mindspore.train import Model | |||
from mindspore.nn.metrics import Accuracy | |||
from dataset import create_dataset | |||
from config import mnist_cfg as cfg | |||
from lenet import LeNet5 | |||
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
    parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'],
                        help='device where the code will be implemented (default: Ascend)')
    parser.add_argument('--data_path', type=str, default="./Data",
                        help='path where the dataset is saved')
    parser.add_argument('--ckpt_path', type=str, default="", help='if mode is test, must provide\
                        path where the trained ckpt file')
    args = parser.parse_args()

    # A checkpoint is mandatory for evaluation; fail early with a clear
    # message instead of letting load_checkpoint fail on an empty path.
    if not args.ckpt_path:
        raise ValueError("--ckpt_path must be provided for evaluation")

    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)

    # Build network, loss and metric.  The optimizer is required by the Model
    # constructor but parameters are not updated during evaluation.
    network = LeNet5(cfg.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Testing ==============")
    param_dict = load_checkpoint(args.ckpt_path)
    load_param_into_net(network, param_dict)

    # Evaluate on the "test" split; repeat_size is 1 for a single pass.
    ds_eval = create_dataset(os.path.join(args.data_path, "test"),
                             cfg.batch_size,
                             1)
    if ds_eval.get_dataset_size() == 0:
        raise ValueError("Please check dataset size > 0 and batch_size <= dataset size")

    acc = model.eval(ds_eval)
    print("============== {} ==============".format(acc))
@@ -0,0 +1,155 @@ | |||
""" | |||
######################## inference lenet example ######################## | |||
inference lenet according to model file | |||
""" | |||
""" | |||
######################## 推理环境使用说明 ######################## | |||
2、在推理环境中,需要将数据集从obs拷贝到推理镜像中,推理完以后,需要将输出的结果拷贝到obs. | |||
(1)将数据集从obs拷贝到推理镜像中: | |||
obs_data_url = args.data_url | |||
args.data_url = '/home/work/user-job-dir/data/' | |||
if not os.path.exists(args.data_url): | |||
os.mkdir(args.data_url) | |||
try: | |||
mox.file.copy_parallel(obs_data_url, args.data_url) | |||
print("Successfully Download {} to {}".format(obs_data_url, | |||
args.data_url)) | |||
except Exception as e: | |||
print('moxing download {} to {} failed: '.format( | |||
obs_data_url, args.data_url) + str(e)) | |||
(2)将模型文件从obs拷贝到推理镜像中: | |||
obs_ckpt_url = args.ckpt_url | |||
args.ckpt_url = '/home/work/user-job-dir/checkpoint.ckpt' | |||
try: | |||
mox.file.copy(obs_ckpt_url, args.ckpt_url) | |||
print("Successfully Download {} to {}".format(obs_ckpt_url, | |||
args.ckpt_url)) | |||
except Exception as e: | |||
print('moxing download {} to {} failed: '.format( | |||
obs_ckpt_url, args.ckpt_url) + str(e)) | |||
(3)将输出的模型拷贝回obs: | |||
obs_result_url = args.result_url | |||
args.result_url = '/home/work/user-job-dir/result/' | |||
if not os.path.exists(args.result_url): | |||
os.mkdir(args.result_url) | |||
try: | |||
mox.file.copy_parallel(args.result_url, obs_result_url) | |||
print("Successfully Upload {} to {}".format(args.result_url, obs_result_url)) | |||
except Exception as e: | |||
print('moxing upload {} to {} failed: '.format(args.result_url, obs_result_url) + str(e)) | |||
""" | |||
import os | |||
import argparse | |||
import moxing as mox | |||
import mindspore.nn as nn | |||
from mindspore import context | |||
from mindspore.train.serialization import load_checkpoint, load_param_into_net | |||
from mindspore.train import Model | |||
from mindspore.nn.metrics import Accuracy | |||
from mindspore import Tensor | |||
import numpy as np | |||
from glob import glob | |||
from dataset import create_dataset | |||
from config import mnist_cfg as cfg | |||
from lenet import LeNet5 | |||
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
    parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'],
                        help='device where the code will be implemented (default: Ascend)')
    parser.add_argument('--data_url',
                        type=str,
                        default="./Data",
                        help='path where the dataset is saved')
    parser.add_argument('--ckpt_url',
                        help='model to save/load',
                        default='./ckpt_url')
    parser.add_argument('--result_url',
                        help='result folder to save/load',
                        default='./result')
    args = parser.parse_args()

    # ---- Copy the dataset from OBS into the inference image ----
    obs_data_url = args.data_url
    args.data_url = '/home/work/user-job-dir/data/'
    if not os.path.exists(args.data_url):
        os.mkdir(args.data_url)
    try:
        mox.file.copy_parallel(obs_data_url, args.data_url)
        print("Successfully Download {} to {}".format(obs_data_url,
                                                      args.data_url))
    except Exception as e:
        print('moxing download {} to {} failed: '.format(
            obs_data_url, args.data_url) + str(e))

    # ---- Copy the checkpoint from OBS into the inference image ----
    # mox.file.copy_parallel is for directories; use mox.file.copy for a
    # single file such as the checkpoint.
    obs_ckpt_url = args.ckpt_url
    args.ckpt_url = '/home/work/user-job-dir/checkpoint.ckpt'
    try:
        mox.file.copy(obs_ckpt_url, args.ckpt_url)
        print("Successfully Download {} to {}".format(obs_ckpt_url,
                                                      args.ckpt_url))
    except Exception as e:
        print('moxing download {} to {} failed: '.format(
            obs_ckpt_url, args.ckpt_url) + str(e))

    # ---- Prepare the local result output directory (result_url) ----
    obs_result_url = args.result_url
    args.result_url = '/home/work/user-job-dir/result/'
    if not os.path.exists(args.result_url):
        os.mkdir(args.result_url)

    args.dataset_path = args.data_url
    args.save_checkpoint_path = args.ckpt_url

    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)

    # Build network, loss and metric.  The optimizer is required by the Model
    # constructor but parameters are not updated during inference.
    network = LeNet5(cfg.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Testing ==============")
    args.load_ckpt_url = os.path.join(args.save_checkpoint_path)
    # Bug fix: the original passed the format string and the value as two
    # separate print() arguments, so the "{}" placeholder was never filled.
    print("args.load_ckpt_url is:{}".format(args.load_ckpt_url))
    param_dict = load_checkpoint(args.load_ckpt_url)
    load_param_into_net(network, param_dict)

    # Build the test dataset with batch_size=1 so a single image is drawn.
    ds_test = create_dataset(os.path.join(args.dataset_path, "test"), batch_size=1).create_dict_iterator()
    data = next(ds_test)
    labels = data["label"].asnumpy()

    # Predict the class of the image with model.predict.
    # (Removed the duplicate `pred` computation and the debug prints that
    # dumped the whole input tensor.)
    output = model.predict(Tensor(data['image']))
    predicted = np.argmax(output.asnumpy(), axis=1)
    print('predicted:', predicted)

    # Report predicted vs. actual class and append the result to result_url.
    print(f'Predicted: "{predicted[0]}", Actual: "{labels[0]}"')
    filename = 'result.txt'
    file_path = os.path.join(args.result_url, filename)
    with open(file_path, 'a+') as file:
        file.write(" {}: {:.2f} \n".format("Predicted", predicted[0]))

    # ---- Upload the results from the local result dir back to OBS ----
    # Fixed boilerplate: outputs copied back to OBS are downloadable from the
    # corresponding training task on the OpenI platform.
    try:
        mox.file.copy_parallel(args.result_url, obs_result_url)
        print("Successfully Upload {} to {}".format(args.result_url, obs_result_url))
    except Exception as e:
        print('moxing upload {} to {} failed: '.format(args.result_url, obs_result_url) + str(e))
@@ -0,0 +1,60 @@ | |||
# Copyright 2020 Huawei Technologies Co., Ltd | |||
# | |||
# Licensed under the Apache License, Version 2.0 (the "License"); | |||
# you may not use this file except in compliance with the License. | |||
# You may obtain a copy of the License at | |||
# | |||
# http://www.apache.org/licenses/LICENSE-2.0 | |||
# | |||
# Unless required by applicable law or agreed to in writing, software | |||
# distributed under the License is distributed on an "AS IS" BASIS, | |||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
# See the License for the specific language governing permissions and | |||
# limitations under the License. | |||
# ============================================================================ | |||
"""LeNet.""" | |||
import mindspore.nn as nn | |||
from mindspore.common.initializer import Normal | |||
class LeNet5(nn.Cell):
    """LeNet-5 convolutional network.

    Args:
        num_class (int): Number of output classes. Default: 10.
        num_channel (int): Number of input image channels. Default: 1.
        include_top (bool): When False, only the convolutional feature
            extractor runs and the flatten/dense head is skipped.

    Returns:
        Tensor, classification logits (or the pooled feature map when
        ``include_top`` is False).

    Examples:
        >>> LeNet(num_class=10)
    """

    def __init__(self, num_class=10, num_channel=1, include_top=True):
        super(LeNet5, self).__init__()
        # Feature extractor: two valid-padded 5x5 convolutions sharing one
        # ReLU and one 2x2/stride-2 max-pool cell.
        self.conv1 = nn.Conv2d(num_channel, 6, 5, pad_mode='valid')
        self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid')
        self.relu = nn.ReLU()
        self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2)
        self.include_top = include_top
        if self.include_top:
            # Classifier head: 16*5*5 flattened features -> 120 -> 84 -> classes.
            self.flatten = nn.Flatten()
            self.fc1 = nn.Dense(16 * 5 * 5, 120, weight_init=Normal(0.02))
            self.fc2 = nn.Dense(120, 84, weight_init=Normal(0.02))
            self.fc3 = nn.Dense(84, num_class, weight_init=Normal(0.02))

    def construct(self, x):
        # conv -> relu -> pool, twice.
        x = self.max_pool2d(self.relu(self.conv1(x)))
        x = self.max_pool2d(self.relu(self.conv2(x)))
        if not self.include_top:
            return x
        # Dense head with ReLU between the first two layers; raw logits out.
        x = self.flatten(x)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)
@@ -0,0 +1,161 @@ | |||
""" | |||
######################## train lenet example ######################## | |||
train lenet and get network model files(.ckpt) | |||
""" | |||
""" | |||
######################## 训练环境使用说明 ######################## | |||
假设已经使用Ascend NPU调试环境调试完代码,欲将调试环境的代码迁移到训练环境进行训练,需要做以下工作: | |||
1、调试环境的镜像和训练环境的镜像是两个不同的镜像,所处的运行目录不一致,需要将data_url和train_url的路径进行变换 | |||
在调试环境中: | |||
args.data_url = '/home/ma-user/work/data/' //数据集位置 | |||
args.train_url = '/home/ma-user/work/model/' //训练输出的模型位置 | |||
在训练环境变换为: | |||
args.data_url = '/home/work/user-job-dir/inputs/data/' | |||
args.train_url = '/home/work/user-job-dir/outputs/model/' | |||
2、在训练环境中,需要将数据集从obs拷贝到训练镜像中,训练完以后,需要将输出的模型拷贝到obs. | |||
将数据集从obs拷贝到训练镜像中: | |||
try: | |||
mox.file.copy_parallel(obs_data_url, args.data_url) | |||
print("Successfully Download {} to {}".format(obs_data_url, | |||
args.data_url)) | |||
except Exception as e: | |||
print('moxing download {} to {} failed: '.format( | |||
obs_data_url, args.data_url) + str(e)) | |||
将输出的模型拷贝到obs: | |||
try: | |||
mox.file.copy_parallel(args.train_url, obs_train_url) | |||
print("Successfully Upload {} to {}".format(args.train_url, | |||
obs_train_url)) | |||
except Exception as e: | |||
print('moxing upload {} to {} failed: '.format(args.train_url, | |||
obs_train_url) + str(e)) | |||
""" | |||
import os | |||
import argparse | |||
import moxing as mox | |||
from config import mnist_cfg as cfg | |||
from dataset import create_dataset | |||
from lenet import LeNet5 | |||
import mindspore.nn as nn | |||
from mindspore import context | |||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor | |||
from mindspore.train import Model | |||
from mindspore.nn.metrics import Accuracy | |||
from mindspore.common import set_seed | |||
parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
# data_url / train_url are the fixed parameter names ModelArts passes for the
# dataset path and the model output path.
parser.add_argument('--data_url',
                    help='path to training/inference dataset folder',
                    default='./data')
parser.add_argument('--train_url',
                    help='model folder to save/load',
                    default='./model')
parser.add_argument(
    '--device_target',
    type=str,
    default="Ascend",
    choices=['Ascend', 'GPU', 'CPU'],
    help='device where the code will be implemented (default: Ascend)')
# User-defined parameters: dataset_path is pointed at data_url and
# save_checkpoint_path at train_url in __main__, so neither needs to be added
# as a ModelArts hyper-parameter — data_url/train_url are used by default.
parser.add_argument('--dataset_path',
                    type=str,
                    default="./Data",
                    help='path where the dataset is saved')
parser.add_argument('--save_checkpoint_path',
                    type=str,
                    default="./ckpt",
                    # Fixed the garbled original help text.
                    help='path where the trained ckpt files are saved')
parser.add_argument('--epoch_size',
                    type=int,
                    default=5,
                    help='Training epochs.')
# Fix the global seed for reproducible runs.
set_seed(1)
if __name__ == "__main__":
    args = parser.parse_args()

    # ---- Copy the dataset from OBS into the training image (boilerplate) ----
    # Redefine data_url/train_url to the fixed in-image paths, then download
    # the dataset from OBS into that location.
    obs_data_url = args.data_url
    args.data_url = '/home/work/user-job-dir/inputs/data/'
    obs_train_url = args.train_url
    args.train_url = '/home/work/user-job-dir/outputs/model/'
    try:
        mox.file.copy_parallel(obs_data_url, args.data_url)
        print("Successfully Download {} to {}".format(obs_data_url,
                                                      args.data_url))
    except Exception as e:
        print('moxing download {} to {} failed: '.format(
            obs_data_url, args.data_url) + str(e))

    # Point dataset_path at data_url and save_checkpoint_path at train_url.
    args.dataset_path = args.data_url
    args.save_checkpoint_path = args.train_url

    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args.device_target)

    # Build the training dataset.
    ds_train = create_dataset(os.path.join(args.dataset_path, "train"),
                              cfg.batch_size)
    if ds_train.get_dataset_size() == 0:
        raise ValueError(
            "Please check dataset size > 0 and batch_size <= dataset size")

    # Build network, loss and optimizer.
    network = LeNet5(cfg.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    if args.device_target != "Ascend":
        model = Model(network,
                      net_loss,
                      net_opt,
                      metrics={"accuracy": Accuracy()})
    else:
        # Mixed precision (amp_level O2) is only enabled on Ascend.
        model = Model(network,
                      net_loss,
                      net_opt,
                      metrics={"accuracy": Accuracy()},
                      amp_level="O2")

    config_ck = CheckpointConfig(
        save_checkpoint_steps=cfg.save_checkpoint_steps,
        keep_checkpoint_max=cfg.keep_checkpoint_max)
    # Checkpoints are written to save_checkpoint_path (i.e. train_url).
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                 directory=args.save_checkpoint_path,
                                 config=config_ck)

    print("============== Starting Training ==============")
    # A non-zero --epoch_size overrides the value from the config file.
    epoch_size = args.epoch_size if args.epoch_size else cfg['epoch_size']
    print('epoch_size is: ', epoch_size)
    model.train(epoch_size,
                ds_train,
                callbacks=[time_cb, ckpoint_cb,
                           LossMonitor()])

    # ---- Upload the trained model back to OBS (boilerplate) ----
    # Model files copied back to OBS are downloadable from the corresponding
    # training task on the OpenI platform.
    try:
        mox.file.copy_parallel(args.train_url, obs_train_url)
        print("Successfully Upload {} to {}".format(args.train_url,
                                                    obs_train_url))
    except Exception as e:
        print('moxing upload {} to {} failed: '.format(args.train_url,
                                                       obs_train_url) + str(e))
@@ -0,0 +1,242 @@ | |||
# 第13课-CIFAR-10图像识别项目实战 | |||
## 如何基于云脑1和云脑2调试任务 | |||
hi,大家好，经历了前面多轮课程，相信大家已经对平台的各部分功能差不多都熟悉了，恭喜大家已经不再是当初那个小白啦~
今天我们通过平台演示一个猫狗图像识别的小项目,将前面所学到的知识串联起来加以巩固,开始向启智社区达人迈进! | |||
猫狗识别是图像分类的经典案例之一,大家感兴趣的可以自行百度,此次就不过多介绍了。 | |||
本节课主要演示如何在云脑1和云脑2进行任务调试,大致内容如下: | |||
> 代码和数据集上传 | |||
> 分别进入云脑1和云脑2环境调试任务 | |||
> 进入云脑2环境训练任务 | |||
话不多说,接下来进入猫狗图像识别的项目实操 | |||
### 一、新建项目并上传代码和数据集 | |||
#### 1. 新建项目 | |||
在【个人中心】页面,点击页面右侧【项目列表】旁边的【+】,开始创建项目,填好相关信息,勾选✔初始化存储库,点击【创建项目】 | |||
<div align="center"> | |||
<img src= img/L14-1.gif width=100%> | |||
</div> | |||
<br> | |||
#### 2. 上传代码文件 | |||
项目创建后,在【代码】页面点击【上传文件】进入文件上传页面,在【添加目录】处输入目录名称(即文件夹名称),直接从本地将代码文件拖入上传框 | |||
(代码文件可自行从网上下载,或在OpenI_Learning项目进行下载,代码文件名【case_Cats and Dogs】) | |||
<div align="center"> | |||
<img src= img/L14-2.gif width=100%> | |||
</div> | |||
<br> | |||
此处上传了两个文件,case1代码表示从云脑1进行调试,case2代码表示从云脑2进行调试 | |||
#### 3. 上传数据集 | |||
猫狗数据集可从[Kaggle](https://www.kaggle.com/datasets)下载,也可以从OpenI_Learning项目下载,数据集名称【cifar-10-batches-py.zip】。 | |||
下载好数据集后,进入【数据集】页面,点击【CPU/GPU】选择云脑1,将数据集直接拖入上传框内,【Ascend NPU】为云脑2,分别上传数据集(云脑1和云脑2数据集不共用,所以需选择相应的环境上传数据集) | |||
<div align="center"> | |||
<img src= img/L14-3.png width=80%> <br><img src= img/L14-4.png width=80%> | |||
</div> | |||
<br> | |||
### 二、进入云脑环境调试任务 | |||
云脑1和云脑2都可以调试任务,这里我们依次进行演示。 | |||
#### 1. 云脑1调试任务 | |||
##### a. 新建调试任务 | |||
在【个人中心】页面点击【云脑】,选择CPU/GPU(即云脑1环境),点击右侧按钮【新建调试任务】 | |||
<div align="center"> | |||
<img src= img/L14-5.png width=100%> | |||
</div> | |||
<br> | |||
进入新建任务页面,更改或默认任务名称,镜像要求基于Python 3.6+ 和PyTorch 1.0+的环境,可自行配置或选择已经配置好的其他环境,数据集选择对应已上传的数据集(可直接输入数据集名称关键词),其他选项默认即可 | |||
<div align="center"> | |||
<img src= img/L14-6.png width=100%> | |||
</div> | |||
<br> | |||
任务创建好后,等待十秒左右待状态从CREATING变为RUNNING,点击页面的【调试】进入调试环境 | |||
<div align="center"> | |||
<img src= img/L14-7.png width=100%> | |||
</div> | |||
<br> | |||
##### b. 进入云脑1环境 | |||
点击调试后,会跳转到云脑的调试环境 | |||
<div align="center"> | |||
<img src= img/L14-8.png width=100%> | |||
</div> | |||
<br> | |||
##### c. 云脑1环境调试任务 | |||
此时,大家可以进行自定义调试和运行代码了。 | |||
> #ls | |||
> #cd /code/case1 (相应代码放在/code下,相应数据集放在/dataset下) | |||
> #python main.py | |||
<div align="center"> | |||
<img src= img/L14-9.png width=100%> | |||
</div> | |||
<br> | |||
由于每个epoch要训练5万张,在1W张里面进行测试,运行过程会有点儿漫长,但云脑1的调试已经跑通,静等运行结束即可。 | |||
<div align="center"> | |||
<img src= img/L14-10.png width=100%> | |||
</div> | |||
<br> | |||
--- | |||
接下来我们演示从云脑2进行调试 | |||
#### 2. 云脑2调试任务 | |||
##### a. 新建调试任务 | |||
同样地，在【云脑】页面下拉【CPU/GPU】选项框，选择【Ascend NPU】，点击右侧按钮【新建调试任务】
<div align="center"> | |||
<img src= img/L14-11.png width=100%> | |||
</div> | |||
<br> | |||
进入新建任务页面,选择相应数据集,其他为默认即可,点击【新建任务】 | |||
<div align="center"> | |||
<img src= img/L14-12.png width=100%> | |||
</div> | |||
<br> | |||
调试任务创建完成后,稍等十秒,待状态变为RUNNING后,点击【调试】 | |||
<div align="center"> | |||
<img src= img/L14-13.png width=100%> | |||
</div> | |||
<br> | |||
##### b. 进入云脑2环境 | |||
点击调试后,进入调试界面,勾选相应数据集,点击【Sync OBS】进行同步后会弹出确认框,勾选✔【YES】即可,待页面提示成功,关闭即可 | |||
<div align="center"> | |||
<img src= img/L14-3.gif width=100%> | |||
</div> | |||
<br> | |||
点击页面右侧的【New】,创建调试用的Notebook,选择【MindSpore】环境 | |||
<div align="center"> | |||
<img src= img/L14-14.png width=100%> | |||
</div> | |||
<br> | |||
此时,页面跳转到调试环境 | |||
<div align="center"> | |||
<img src= img/L14-15.png width=100%> | |||
</div> | |||
<br> | |||
##### c. 云脑2环境调试任务 | |||
我们先将代码克隆过来 | |||
> 在【个人中心】的【代码】界面,点击右侧按钮【复制链接】 | |||
> 回到调试界面,输入 !git clone 后粘贴链接 | |||
> 点击【Run】运行代码,提示克隆成功 | |||
<div align="center"> | |||
<img src= img/L14-4.gif width=100%> | |||
</div> | |||
<br> | |||
接下来解压数据集,在代码中输入 !unzip + 文件名称,点击运行 | |||
<div align="center"> | |||
<img src= img/L14-16.png width=100%> | |||
</div> | |||
<br> | |||
最后,运行代码 !python OpenI_test/case2/train.py --dataset_path ./cifar-10-batches-bin/ | |||
<div align="center"> | |||
<img src= img/L14-17.png width=100%> | |||
</div> | |||
<br> | |||
### 三、云脑2训练任务 | |||
#### 1. 新建训练任务 | |||
在云脑页面,选择【Ascend NPU】,点击【训练任务】,再点击【新建训练任务】 | |||
<div align="center"> | |||
<img src= img/L14-18.png width=100%> | |||
</div> | |||
<br> | |||
在弹出的任务创建页面中,选择数据集【cifar.zip】,指定文件中输入“case2/train.py”,其他默认即可,点击【新建任务】 | |||
<div align="center"> | |||
<img src= img/L14-19.png width=100%> | |||
</div> | |||
<br> | |||
#### 2. 训练任务 | |||
点击了新建任务之后，待状态由INIT变为RUNNING之后，模型将开始启动训练，运行时长大约为几分钟
<div align="center"> | |||
<img src= img/L14-20.png width=100%> | |||
</div> | |||
<br> | |||
#### 3. 模型下载 | |||
当任务状态变为“COMPLETED”,表示任务训练成功并已结束。 | |||
<div align="center"> | |||
<img src= img/L14-21.png width=100%> | |||
</div> | |||
<br> | |||
点击操作栏的【模型下载】可下载模型 | |||
<div align="center"> | |||
<img src= img/L14-22.png width=100%> | |||
</div> | |||
<br> | |||
至此,我们已经在云脑环境(云脑1和云脑2)都顺利调试和训练了模型,通过这次项目实战,相信童鞋们又进一步熟悉了平台的强大功能,后面在实践自己项目时就可以得心应手了~ |