
remove loss scale in GPU

Branch: master
Author: lvyufeng, 1 week ago
Commit: 7d958af32c

2 changed files with 26 additions and 14 deletions:
  1. src/gru_for_train.py (+7, -4)
  2. train.py (+19, -10)

src/gru_for_train.py (+7, -4)

@@ -243,7 +243,7 @@ class GRUTrainOneStepWithLossScaleCell(nn.Cell):
 
 class GRUTrainOneStepCell(nn.TrainOneStepCell):
     """
-    Encapsulation class of bert network training.
+    Encapsulation class of GRU network training.
 
     Append an optimizer to the training network after that the construct
     function can be called to create the backward graph.
@@ -252,12 +252,15 @@ class GRUTrainOneStepCell(nn.TrainOneStepCell):
         network (Cell): The training network. Note that loss function should have been added.
         optimizer (Optimizer): Optimizer for updating the weights.
         sens (Number): The adjust parameter. Default: 1.0.
+        enable_clip_grad (boolean): If True, clip gradients in GRUTrainOneStepCell. Default: True.
     """
 
-    def __init__(self, network, optimizer, sens=1.0):
+    def __init__(self, network, optimizer, sens=1.0, enable_clip_grad=True):
         super(GRUTrainOneStepCell, self).__init__(network, optimizer, sens)
         self.cast = P.Cast()
         self.hyper_map = C.HyperMap()
         self.clip_gradients = ClipGradients()
+        self.enable_clip_grad = enable_clip_grad
 
     def set_sens(self, value):
         self.sens = value
@@ -279,8 +282,8 @@ class GRUTrainOneStepCell(nn.TrainOneStepCell):
                           teacher_force,
                           self.cast(F.tuple_to_array((self.sens,)),
                                     mstype.float32))
-        grads = self.clip_gradients(grads, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE)
+        if self.enable_clip_grad:
+            grads = self.clip_gradients(grads, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE)
         grads = self.grad_reducer(grads)
         succ = self.optimizer(grads)
         return F.depend(loss, succ)
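
With this change, GPU training uses the plain cell: the dynamic loss-scale update cell is dropped while gradient clipping stays on through enable_clip_grad. A minimal usage sketch of the new cell follows, assuming GRUWithLossCell wraps Seq2Seq(config) as the imports in train.py suggest; those constructor signatures and the learning-rate value are not part of this diff and are only illustrative.

from mindspore.nn.optim import Adam
from mindspore.train import Model
from src.config import config
from src.seq2seq import Seq2Seq
from src.gru_for_train import GRUWithLossCell, GRUTrainOneStepCell

# Hypothetical construction; train.py builds these from config and dynamic_lr.
network = GRUWithLossCell(Seq2Seq(config))
opt = Adam(network.trainable_params(), learning_rate=1e-3)

# GPU path: no DynamicLossScaleManager update cell; clipping stays enabled by default.
netwithgrads = GRUTrainOneStepCell(network, opt)   # sens=1.0, enable_clip_grad=True
netwithgrads.set_train(True)
model = Model(netwithgrads)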

train.py (+19, -10)

@@ -29,7 +29,7 @@ from mindspore.train.loss_scale_manager import DynamicLossScaleManager
 from mindspore.nn.optim import Adam
 from src.config import config
 from src.seq2seq import Seq2Seq
-from src.gru_for_train import GRUWithLossCell, GRUTrainOneStepWithLossScaleCell
+from src.gru_for_train import GRUWithLossCell, GRUTrainOneStepWithLossScaleCell, GRUTrainOneStepCell
 from src.dataset import create_gru_dataset
 from src.lr_schedule import dynamic_lr
 set_seed(1)
@@ -82,13 +82,20 @@ class LossCallBack(Callback):
                                                                          cb_params.cur_step_num,
                                                                          str(cb_params.net_outputs)))
         with open("./loss_{}.log".format(self.rank_id), "a+") as f:
-            f.write("time: {}, epoch: {}, step: {}, loss: {}, overflow: {}, loss_scale: {}".format(
-                time_stamp_current - time_stamp_first,
-                cb_params.cur_epoch_num,
-                cb_params.cur_step_num,
-                str(cb_params.net_outputs[0].asnumpy()),
-                str(cb_params.net_outputs[1].asnumpy()),
-                str(cb_params.net_outputs[2].asnumpy())))
+            if context.get_context("device_target") == "Ascend":
+                f.write("time: {}, epoch: {}, step: {}, loss: {}, overflow: {}, loss_scale: {}".format(
+                    time_stamp_current - time_stamp_first,
+                    cb_params.cur_epoch_num,
+                    cb_params.cur_step_num,
+                    str(cb_params.net_outputs[0].asnumpy()),
+                    str(cb_params.net_outputs[1].asnumpy()),
+                    str(cb_params.net_outputs[2].asnumpy())))
+            else:
+                f.write("time: {}, epoch: {}, step: {}, loss: {}".format(
+                    time_stamp_current - time_stamp_first,
+                    cb_params.cur_epoch_num,
+                    cb_params.cur_step_num,
+                    str(cb_params.net_outputs.asnumpy())))
             f.write('\n')
 
 if __name__ == '__main__':
@@ -131,8 +138,10 @@ if __name__ == '__main__':
                                             scale_factor=config.scale_factor,
                                             scale_window=config.scale_window)
     update_cell = scale_manager.get_update_cell()
-    netwithgrads = GRUTrainOneStepWithLossScaleCell(network, opt, update_cell)
-
+    if args.device_target == "Ascend":
+        netwithgrads = GRUTrainOneStepWithLossScaleCell(network, opt, update_cell)
+    else:
+        netwithgrads = GRUTrainOneStepCell(network, opt)
     time_cb = TimeMonitor(data_size=dataset_size)
     loss_cb = LossCallBack(rank_id=rank)
     cb = [time_cb, loss_cb]
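
The device_target branch in LossCallBack above is needed because the two training cells return different outputs: the loss-scale cell used on Ascend yields a (loss, overflow, loss_scale) tuple, while the plain GPU cell yields only the loss Tensor. A small hypothetical helper, not part of this commit, that would normalize both cases:

def extract_loss(net_outputs):
    """Hypothetical helper: return the scalar loss from either training cell.

    GRUTrainOneStepWithLossScaleCell (Ascend) returns (loss, overflow, loss_scale);
    GRUTrainOneStepCell (GPU) returns just the loss Tensor.
    """
    if isinstance(net_outputs, (tuple, list)):
        return float(net_outputs[0].asnumpy())
    return float(net_outputs.asnumpy())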

