(diff to make it use the same method for benchmarking training speed - click to expand)
diff --git a/tools/train_utils/train_utils.py b/tools/train_utils/train_utils.py
index 91f21dd..021359d 100644
--- a/tools/train_utils/train_utils.py
+++ b/tools/train_utils/train_utils.py
@@ -2,6 +2,7 @@ import torch
import os
import glob
import tqdm
+import datetime
from torch.nn.utils import clip_grad_norm_
@@ -13,7 +14,10 @@ def train_one_epoch(model, optimizer, train_loader, model_func, lr_scheduler, ac
if rank == 0:
pbar = tqdm.tqdm(total=total_it_each_epoch, leave=leave_pbar, desc='train', dynamic_ncols=True)
+ start_time = None
for cur_it in range(total_it_each_epoch):
+ if cur_it > 49 and start_time is None:  # skip the first 50 iterations as warmup
+ start_time = datetime.datetime.now()
try:
batch = next(dataloader_iter)
except StopIteration:
@@ -55,9 +59,11 @@ def train_one_epoch(model, optimizer, train_loader, model_func, lr_scheduler, ac
tb_log.add_scalar('learning_rate', cur_lr, accumulated_iter)
for key, val in tb_dict.items():
tb_log.add_scalar('train_' + key, val, accumulated_iter)
+ endtime = datetime.datetime.now()
+ speed = (endtime - start_time).total_seconds() / (total_it_each_epoch - 50)  # avg seconds per timed iteration; .seconds would drop sub-second precision
if rank == 0:
pbar.close()
- return accumulated_iter
+ return accumulated_iter, speed
def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_cfg,
@@ -65,6 +71,7 @@ def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_
lr_warmup_scheduler=None, ckpt_save_interval=1, max_ckpt_save_num=50,
merge_all_iters_to_one_epoch=False):
accumulated_iter = start_iter
+ speeds = []
with tqdm.trange(start_epoch, total_epochs, desc='epochs', dynamic_ncols=True, leave=(rank == 0)) as tbar:
total_it_each_epoch = len(train_loader)
if merge_all_iters_to_one_epoch:
@@ -82,7 +89,7 @@ def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_
cur_scheduler = lr_warmup_scheduler
else:
cur_scheduler = lr_scheduler
- accumulated_iter = train_one_epoch(
+ accumulated_iter, speed = train_one_epoch(
model, optimizer, train_loader, model_func,
lr_scheduler=cur_scheduler,
accumulated_iter=accumulated_iter, optim_cfg=optim_cfg,
@@ -91,7 +98,7 @@ def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_
total_it_each_epoch=total_it_each_epoch,
dataloader_iter=dataloader_iter
)
-
+ speeds.append(speed)
# save trained model
trained_epoch = cur_epoch + 1
if trained_epoch % ckpt_save_interval == 0 and rank == 0:
@@ -107,6 +114,8 @@ def train_model(model, optimizer, train_loader, model_func, lr_scheduler, optim_
save_checkpoint(
checkpoint_state(model, optimizer, trained_epoch, accumulated_iter), filename=ckpt_name,
)
+ print(f'epoch speed: {speed:.4f} s/iter')
+ print(f'******* avg speed: {sum(speeds) / len(speeds):.4f} s/iter *******')
def model_state_to_cpu(model_state):
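
In short, the patch skips the first 50 iterations of each epoch as warmup, times the remainder, and prints the per-epoch and running-average speed whenever a checkpoint is saved. Below is a minimal standalone sketch of the same warmup-then-time idea; the names `timed_epoch` and `step_fn` are illustrative, not part of the repo. It uses `time.perf_counter()` rather than `datetime` and, unlike the patch itself, guards against epochs shorter than the warmup window:

```python
import time

WARMUP_ITERS = 50  # the patch above skips the first 50 iterations


def timed_epoch(step_fn, num_iters, warmup=WARMUP_ITERS):
    """Run num_iters steps, timing only the iterations after the warmup.

    step_fn is a hypothetical stand-in for one training step (the
    forward/backward pass in train_one_epoch). Returns average seconds
    per timed iteration, or None if the epoch is shorter than the warmup.
    """
    start = None
    for it in range(num_iters):
        if it == warmup:
            start = time.perf_counter()  # start the clock after warmup
        step_fn()
    if start is None:
        return None  # too few iterations to measure
    return (time.perf_counter() - start) / (num_iters - warmup)


if __name__ == "__main__":
    # Toy step: ~2 ms sleep standing in for a real training iteration.
    avg = timed_epoch(lambda: time.sleep(0.002), num_iters=200)
    print(f"avg: {avg:.4f} s/iter")
```

Note that this measures wall-clock throughput of the whole loop (data loading included). If the step launches CUDA kernels and you want to time GPU work specifically, call `torch.cuda.synchronize()` before each clock read, since kernels execute asynchronously.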