Deleting a branch is permanent. It CANNOT be undone. Continue?
Dear OpenI User
Thank you for your continued support of the OpenI Qizhi Community AI Collaboration Platform. In order to protect your usage rights and ensure network security, we updated the OpenI Qizhi Community AI Collaboration Platform Usage Agreement in January 2024. The updated agreement specifies that users are prohibited from using intranet penetration tools. After you click "Agree and continue", you can continue to use our services. Thank you for your cooperation and understanding.
For more agreement content, please refer to the《OpenI Qizhi Community AI Collaboration Platform Usage Agreement》
问题描述
基于Pytorch1.11的NPU镜像使用Ascend910多卡并行时报错。
相关环境(GPU/NPU)
NPU
相关集群(启智/智算)
智算集群
任务类型(调试/训练/推理)
训练任务
任务名
wuxin202311291026437
日志说明或问题截图
Traceback (most recent call last):
File "/cache/code/nuwa/src/utils/utils.py", line 42, in wrap
metric_dict, object_dict = task_func(cfg=cfg)
File "/cache/code/nuwa/src/tasks/train_task.py", line 63, in train
trainer.fit(model=model, datamodule=datamodule, ckpt_path=cfg.get("ckpt_path"))
File "/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 771, in fit
self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
File "/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 721, in _call_and_handle_interrupt
return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
File "/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 93, in launch
return function(*args, **kwargs)
File "/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 811, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1172, in _run
self.__setup_profiler()
File "/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1797, in __setup_profiler
self.profiler.setup(stage=self.state.fn._setup_fn, local_rank=local_rank, log_dir=self.log_dir)
File "/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 2249, in log_dir
dirpath = self.strategy.broadcast(dirpath)
File "/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/lightning_npu-0.0.0-py3.7.egg/lightning_npu/strategies/npu_parallel.py", line 105, in broadcast
self.broadcast_object_list(obj, src, group=_group.WORLD)
File "/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/lightning_npu-0.0.0-py3.7.egg/lightning_npu/strategies/npu_parallel.py", line 75, in broadcast_object_list
broadcast(object_sizes_tensor, src=src, group=group)
File "/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/torch_npu/distributed/distributed_c10d.py", line 1060, in broadcast
work = default_pg.broadcast([tensor], opts)
RuntimeError: HCCL error in: /usr1/workspace/FPTA_Daily_open_pytorchv1.11.0-3.0.tr6/CODE/torch_npu/csrc/distributed/ProcessGroupHCCL.cpp:402
EI9999: Inner Error, Please contact support engineer!
EI9999 host nic listen start failed, ip[0x6a0010ac], port[60002], return[11][FUNC:StartListenHostSocket][FILE:network_manager.cc][LINE:452]
TraceBack (most recent call last):
THPModule_npu_shutdown success.
/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/torchvision/transforms/functional_pil.py:207: DeprecationWarning: BILINEAR is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.BILINEAR instead.
def resize(img, size, interpolation=Image.BILINEAR):
/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/torchvision/transforms/functional_pil.py:280: DeprecationWarning: BICUBIC is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.BICUBIC instead.
def perspective(img, perspective_coeffs, interpolation=Image.BICUBIC, fill=None):
/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/torch/utils/tensorboard/init.py:4: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
if not hasattr(tensorboard, 'version') or LooseVersion(tensorboard.version) < LooseVersion('1.15'):
Error executing job with overrides: ['trainer=npu_parallel.yaml', 'model=vit.yaml', 'paths=forecast_openi.yaml', 'datamodule=h5forecast.yaml']
Traceback (most recent call last):
File "/cache/code/nuwa/src/train.py", line 27, in main
metric_dict, _ = train(cfg)
File "/cache/code/nuwa/src/utils/utils.py", line 45, in wrap
raise ex
File "/cache/code/nuwa/src/utils/utils.py", line 42, in wrap
metric_dict, object_dict = task_func(cfg=cfg)
File "/cache/code/nuwa/src/tasks/train_task.py", line 63, in train
trainer.fit(model=model, datamodule=datamodule, ckpt_path=cfg.get("ckpt_path"))
File "/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 771, in fit
self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
File "/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 723, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 811, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1172, in _run
self.__setup_profiler()
File "/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 1797, in __setup_profiler
self.profiler.setup(stage=self.state.fn._setup_fn, local_rank=local_rank, log_dir=self.log_dir)
File "/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 2249, in log_dir
dirpath = self.strategy.broadcast(dirpath)
File "/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/lightning_npu-0.0.0-py3.7.egg/lightning_npu/strategies/npu_parallel.py", line 105, in broadcast
self.broadcast_object_list(obj, src, group=_group.WORLD)
File "/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/lightning_npu-0.0.0-py3.7.egg/lightning_npu/strategies/npu_parallel.py", line 75, in broadcast_object_list
broadcast(object_sizes_tensor, src=src, group=group)
File "/home/ma-user/anaconda3/envs/PyTorch-1.11/lib/python3.7/site-packages/torch_npu/distributed/distributed_c10d.py", line 1060, in broadcast
work = default_pg.broadcast([tensor], opts)
RuntimeError: store->get() got error: HCCL_BLOCKING_WAIT
期望的解决方案或建议
希望能够定位一下报错原因并提供完整的使用Pytorch NPU镜像的多卡并行的示例,或者能够支持Pytorch_NPU进行ddp训练的完整镜像。
这个镜像Pytorch1.11目前还不支持并行训练
希望可以更新支持并行的镜像,看Ascend官方文档是支持的