#32 910b训练环境一直报错，调试环境正常

Open
created 2 months ago by shengshanbai04 · 8 comments
time="2025-01-20T12:09:13+08:00" level=info msg="init logger successful" file="init.go:55" Command=bootstrap/init Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:09:13+08:00" level=info msg="current user 1000:100" file="init.go:57" Command=bootstrap/init Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:09:13+08:00" level=info msg="report event InitStart success" file="event.go:82" Command=bootstrap/init Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:09:13+08:00" level=info msg="init command: bash /home/ma-user/training/init.sh ''" file="init.go:81" Command=bootstrap/init Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:09:13+08:00" level=info msg="scc is already installed, skipping this step..." Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:13+08:00" level=info msg="[init] toolkit_obs_upload_pid = 56" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:13+08:00" level=info msg="[init] running at 2025-01-20-12:09:13" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:13+08:00" level=info msg="[init] ip of the pod: 172.16.0.60" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:13+08:00" level=info msg="local dir = /home/ma-user/modelarts/log/" file="upload.go:206" Command=obs/upload Component=ma-training-toolkit Platform=ModelArts-Service Task= time="2025-01-20T12:09:13+08:00" level=info msg="obs dir = s3://modelarts-training-log-cn-central-221/16edef32-d900-4530-b12e-7158febf3c2a/worker-0" file="upload.go:209" Command=obs/upload Component=ma-training-toolkit Platform=ModelArts-Service Task= time="2025-01-20T12:09:13+08:00" level=info msg="num of workers = 8" file="upload.go:214" Command=obs/upload Component=ma-training-toolkit Platform=ModelArts-Service Task= time="2025-01-20T12:09:13+08:00" level=info msg="start the periodic upload task, upload Period = 5 
seconds " file="upload.go:220" Command=obs/upload Component=ma-training-toolkit Platform=ModelArts-Service Task= time="2025-01-20T12:09:13+08:00" level=info msg="report event DetectStart success" file="event.go:82" Command=report Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:09:13+08:00" level=info msg="[task]Detect item: disk-size cache" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:13+08:00" level=info msg="[detect] code: 0, message: ok, item: disk-size cache" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:13+08:00" level=info msg="[task]Detect item: dns" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:13+08:00" level=info msg="[detect] code: 0, message: ok, item: dns" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:14+08:00" level=info msg="[task]Detect item: disk-size root" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:14+08:00" level=info msg="[detect] code: 0, message: ok, item: disk-size root" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:14+08:00" level=info msg="[task]Detect item: disk-size shm" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:14+08:00" level=info msg="[detect] code: 0, message: ok, item: disk-size shm" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:14+08:00" level=info msg="report event DetectFinish success" file="event.go:82" Command=report Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:09:17+08:00" level=info msg="[init] autosearch_path is empty, skip the autosearch download" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:17+08:00" level=info msg="[init] code_url is empty, skip the code download." 
Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:17+08:00" level=info msg="[init] record_dir is empty, skip the code upload" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:17+08:00" level=info msg="[init] inputs_handler_job_pid = 283" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:17+08:00" level=info msg="env MA_INPUTS is empty, skip the inputs handler" Component=PythonScripts Platform=ModelArts-Service time="2025-01-20T12:09:18+08:00" level=info msg="[init] exiting at 2025-01-20-12:09:18" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:18+08:00" level=info msg="[init] upload_metrics_pid = 338" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:18+08:00" level=info msg="MA_OUTPUT_PRELOAD_SUFFIX: ." file="preload.go:27" Command=obs/upload Component=ma-training-toolkit Platform=ModelArts-Service Task= time="2025-01-20T12:09:18+08:00" level=info msg="modelarts output channel preload to memarts switch is: false." file="preload.go:40" Command=obs/upload Component=ma-training-toolkit Platform=ModelArts-Service Task= time="2025-01-20T12:09:18+08:00" level=info msg="[init] stop toolkit_obs_upload_pid = 56 by signal SIGTERM" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:18+08:00" level=info msg="the final upload is in progress..." 
file="upload.go:231" Command=obs/upload Component=ma-training-toolkit Platform=ModelArts-Service Task= time="2025-01-20T12:09:19+08:00" level=info msg="[init] toolkit_obs_upload 56 ret_code is 0" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:19+08:00" level=info msg="[init] exit with 0" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:19+08:00" level=info msg="local dir = /home/ma-user/modelarts/log/" file="upload.go:206" Command=bootstrap/init Component=ma-training-toolkit Platform=ModelArts-Service Task= time="2025-01-20T12:09:19+08:00" level=info msg="obs dir = s3://modelarts-training-log-cn-central-221/16edef32-d900-4530-b12e-7158febf3c2a/worker-0" file="upload.go:209" Command=bootstrap/init Component=ma-training-toolkit Platform=ModelArts-Service Task= time="2025-01-20T12:09:19+08:00" level=info msg="num of workers = 8" file="upload.go:214" Command=bootstrap/init Component=ma-training-toolkit Platform=ModelArts-Service Task= time="2025-01-20T12:09:19+08:00" level=info msg="MA_OUTPUT_PRELOAD_SUFFIX: ." file="preload.go:27" Command=bootstrap/init Component=ma-training-toolkit Platform=ModelArts-Service Task= time="2025-01-20T12:09:19+08:00" level=info msg="modelarts output channel preload to memarts switch is: false." 
file="preload.go:40" Command=bootstrap/init Component=ma-training-toolkit Platform=ModelArts-Service Task= time="2025-01-20T12:09:19+08:00" level=info msg="report event InitExit success" file="event.go:82" Command=bootstrap/init Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:09:19+08:00" level=info msg="bootstrap is exiting with exit code 0" file="bootstrap.go:278" Command=bootstrap/init Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:20Z" level=info msg="task-training container update heartbeat file started" file="heartbeat.go:97" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:20Z" level=info msg="init logger successful" file="run_train.go:118" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:20Z" level=info msg="the core file size limit is unlimited\n" file="core_file_collect.go:34" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:20Z" level=info msg="create core file symbolic link to /home/ma-user/modelarts/log successful" file="core_file_collect.go:68" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:20Z" level=info msg="Waiting for SCC server start." file="run_train.go:561" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:09:20+08:00" level=info msg="init logger successful" file="upload.go:44" Command=bootstrap/upload Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:09:20+08:00" level=info msg="start checking secret refresh time." 
file="secret.go:46" Command=bootstrap/upload Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:09:20+08:00" level=info msg="current user 0:0" file="upload.go:52" Command=bootstrap/upload Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:09:20+08:00" level=info msg="report event SidecarStart success" file="event.go:82" Command=bootstrap/upload Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:09:20+08:00" level=info msg="upload command: /home/ma-user/training/sidecar.sh" file="upload.go:77" Command=bootstrap/upload Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:09:20+08:00" level=info msg="watch periodically task-training container heartbeat file started, periodic interval:heartbeat update:60s, heartbeat check:60s, heartbeat timeout:180s" file="heartbeat.go:42" Command=bootstrap/upload Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:09:20+08:00" level=info msg="scc is already installed, skipping this step..." 
Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:20+08:00" level=info msg="[sidecar] running at 2025-01-20-12:09:20" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:20+08:00" level=info msg="[sidecar] scc server pid = 41" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:20+08:00" level=info msg="[sidecar] toolkit_host_log_collection_pid = 45" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:20+08:00" level=info msg="[sidecar] toolkit_obs_upload_by_channels_pid = 47" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:20+08:00" level=info msg="[sidecar] waiting for training complete" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:09:20+08:00" level=info msg="start host log collection routine, dump period: 30s, host log dir: /var/log/, local log dir: /home/ma-user/modelarts/log" file="host_log.go:40" Command=host-log-collection Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:09:20+08:00" level=info msg="MA_OUTPUTS environment variable is empty, skip creating upload tasks." 
file="upload_by_channels.go:58" Command=obs/upload_by_channels Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:09:20+08:00" level=info msg="local dir = /home/ma-user/modelarts/log/" file="upload.go:206" Command=obs/upload_by_channels Component=ma-training-toolkit Platform=ModelArts-Service Task=srt_log_collection time="2025-01-20T12:09:20+08:00" level=info msg="obs dir = s3://modelarts-training-log-cn-central-221/16edef32-d900-4530-b12e-7158febf3c2a/worker-0" file="upload.go:209" Command=obs/upload_by_channels Component=ma-training-toolkit Platform=ModelArts-Service Task=srt_log_collection time="2025-01-20T12:09:20+08:00" level=info msg="enable append upload mode" file="upload.go:212" Command=obs/upload_by_channels Component=ma-training-toolkit Platform=ModelArts-Service Task=srt_log_collection time="2025-01-20T12:09:20+08:00" level=info msg="num of workers = 8" file="upload.go:214" Command=obs/upload_by_channels Component=ma-training-toolkit Platform=ModelArts-Service Task=srt_log_collection time="2025-01-20T12:09:20+08:00" level=info msg="start the periodic upload task, upload Period = 5 seconds " file="upload.go:220" Command=obs/upload_by_channels Component=ma-training-toolkit Platform=ModelArts-Service Task=srt_log_collection time="2025-01-20T12:09:20+08:00" level=info msg="Starting SCC server on 127.0.0.1:57528" file="server.go:52" Command=scc-server Component=ma-scc-server Platform=ModelArts-Service time="2025-01-20T12:09:20+08:00" level=info msg="local dir = /home/ma-user/modelarts/log/" file="upload.go:206" Command=obs/upload_by_channels Component=ma-training-toolkit Platform=ModelArts-Service Task=log_url time="2025-01-20T12:09:20+08:00" level=info msg="obs dir = obs://grampus/log/" file="upload.go:209" Command=obs/upload_by_channels Component=ma-training-toolkit Platform=ModelArts-Service Task=log_url time="2025-01-20T12:09:20+08:00" level=info msg="num of workers = 8" file="upload.go:214" Command=obs/upload_by_channels 
Component=ma-training-toolkit Platform=ModelArts-Service Task=log_url time="2025-01-20T12:09:20+08:00" level=info msg="start the periodic upload task, upload Period = 30 seconds " file="upload.go:220" Command=obs/upload_by_channels Component=ma-training-toolkit Platform=ModelArts-Service Task=log_url time="2025-01-20T04:09:25Z" level=info msg="SCC server has been init, training continue." file="run_train.go:579" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="report event DetectStart success" file="event.go:82" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=warning msg="DCMIChecker create failed, error: DCMI detection does not currently support custom images, skip this detect step" file="ascend_check.go:88" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=warning msg="ascend-dmi executor create failed, error: ascend-dmi is not found in the /usr/local/Ascend/toolbox path, skip this detect step" file="ascend_check.go:101" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="report event DetectFinish success" file="event.go:82" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="current user name: ma-user" file="image_check.go:69" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="current uid: 1000" file="image_check.go:70" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="current gid: 100" file="image_check.go:71" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=warning msg="can't access /usr/local/seccomponent/lib/, check 
whether install scc lib" file="image_check_unix.go:42" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="image check passed" file="image_check.go:56" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="hccn sampler routine is started, sample interval is 10s" file="sample_routine.go:97" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:09:25+08:00" level=info msg="MA_OUTPUT_PRELOAD_SUFFIX: ." file="preload.go:27" Command=obs/upload_by_channels Component=ma-training-toolkit Platform=ModelArts-Service Task=srt_log_collection time="2025-01-20T12:09:25+08:00" level=info msg="modelarts output channel preload to memarts switch is: false." file="preload.go:40" Command=obs/upload_by_channels Component=ma-training-toolkit Platform=ModelArts-Service Task=srt_log_collection time="2025-01-20T04:09:25Z" level=info msg="npu-smi sampler routine is started, sample interval is 10s" file="sample_routine.go:100" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="report event TrainingStart success" file="event.go:82" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="Skip hang detect" file="run_train.go:634" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="pre train command: mkdir -p ~/.pip; echo -e '[global]\\ntrusted-host = 100.125.0.76:32021\\nindex-url = http://100.125.0.76:32021/repository/pypi/simple/\\ntimeout = 120' > ~/.pip/pip.conf; " file="run_train.go:788" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="pre runtime info collection started" file="run_train.go:1009" Command=bootstrap/run 
Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="skip collecting ib_gids, reason: not an infiniband job" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="skip collecting ib_driver_version_on_host, reason: not an infiniband job" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="skip collecting gpu_driver_version, reason: not a gpu job" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="skip collecting cudnn_version, reason: not a gpu job" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="skip collecting cuda_version, reason: not a gpu job" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="skip collecting ib_dev_to_net_dev, reason: not an infiniband job" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="skip collecting gpu_topo, reason: not a gpu job" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="skip collecting gpu_info, reason: not a gpu job" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="skip collecting ascend_dmi, reason: ascend-dmi executor create failed, error: ascend-dmi is not found in the /usr/local/Ascend/toolbox path" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service 
time="2025-01-20T04:09:25Z" level=info msg="skip collecting ethernet_qos, reason: not an infiniband job" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="skip collecting ofed_info_output, reason: not an infiniband job" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="skip collecting ib_stats, reason: not an infiniband job" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="skip collecting nccl_version, reason: not a gpu job" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:25Z" level=info msg="skip collecting nfs_mount_path_check, reason: MA_NFS_MOUNT_VOLUMES env is not found" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="skip collecting hccn_tool, reason: empty results" file="collector.go:151" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="pre runtime info collection finished" file="run_train.go:1025" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=warning msg="the DEFAULT_CONDA_ENV_NAME env is MindSpore, but the ANACONDA_DIR env is empty, skip setting default conda env" file="run_train.go:769" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="run command: export CODE_NEED_UNZIP=true;export PRETRAIN_MODEL_NEED_UNZIP=false;export DATASET_NEED_UNZIP=true;mkdir -p /cache/code;export LOCAL_CODE_PATH=/cache/code;mkdir -p /cache/dataset;export LOCAL_DATASET_PATH=/cache/dataset;mkdir -p 
/cache/pretrainmodel;export LOCAL_PRETRAIN_MODEL_PATH=/cache/pretrainmodel;mkdir -p /cache/output;export LOCAL_OUTPUT_PATH=/cache/output;export DATA_DOWNLOAD_METHOD=MOXING;source /home/ma-user/.bashrc;python /home/ma-user/davinci/train/davincirun.py python /home/ma-user/grampus.py --'code_url'='s3:///urchincache/job/sheng2025012012t083443716/code/master.zip' --'grampus_code_file_name'='pre_and_suf.py' --'grampus_code_url'='s3:///grampus/system_code/' --'model_url'='s3:///grampus/job/sheng2025012012t083443716/output/' --'multi_data_url'='[{\"dataset_url\":\"s3:///urchincache/jobs/sheng2025012012t083443716/cache/dataset/\",\"dataset_name\":\"dataset\",\"containerPath\":\"/cache/dataset\",\"readOnly\":false,\"isNeedUnzip\":true},{\"dataset_url\":\"s3:///urchincache/attachment/a/5/a5f475a1-93ac-44c8-b88b-f4c3cf6c887a/cifar-10-python.tar.gz\",\"dataset_name\":\"cifar-10-python.tar.gz\",\"containerPath\":\"/cache/dataset/cifar-10-python.tar.gz\",\"readOnly\":true,\"isNeedUnzip\":true}]' --'pretrain_url'='[{\"model_url\":\"s3:///urchincache/aimodels/a/4/a4257430-1c9c-46d3-adc2-7bd34098d937/\",\"model_name\":\"resnet50-0676ba61\",\"containerPath\":\"/cache/pretrainmodel/resnet50-0676ba61\",\"readOnly\":true,\"isNeedUnzip\":false}]' --'boot_file'='train_npu.py' --'code_name'='train';result=$?;bash -c \"[[ $result -eq 0 ]] && exit 0 || exit -1\"; " file="run_train.go:463" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="auto set NCCL_SOCKET_IFNAME env: NCCL_SOCKET_IFNAME=eth0" file="run_train.go:406" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="event listener(name: listenAndServe) started" file="controller.go:97" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="http server listening at 
ma-job-16edef32-d900-4530-b12e-7158febf3c2a-worker-0.ma-job-16edef32-d900-4530-b12e-7158febf3c2a:9527" file="server.go:62" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="event listener(name: checkIn) started" file="controller.go:97" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="event listener(name: checkIn) exited" file="controller.go:93" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="event(name: CheckedInEvent, msg: ) is being handled, len: 0" file="controller.go:63" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="received check-in from task worker-0" file="handler.go:135" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="done handling event(name: CheckedInEvent, msg: )" file="controller.go:72" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="event listener(name: startAndWait) started" file="controller.go:97" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="event listener(name: watchHangMetric) started" file="controller.go:97" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="event listener(name: instructAllStart) started" file="controller.go:97" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="event listener(name: instructAllStart) exited" file="controller.go:93" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="no env 
MA_HANG_RETRY_NUM found, will do nothing" file="watcher.go:49" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=warning msg="hang metric file watcher is empty, skip the watch step" file="listener.go:499" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="event listener(name: watchHangMetric) exited" file="controller.go:93" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="event listener(name: watchNPUResetConfig) started" file="controller.go:97" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="event listener(name: watchGPUResetConfig) started" file="controller.go:97" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="add path /etc/reset/reset.json to npu reset config watcher sucessful" file="watcher.go:44" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="hot reset config file watcher is running, npu reset config path: /etc/reset/reset.json" file="watcher.go:54" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="event listener(name: watchGPUResetConfig) exited" file="controller.go:93" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="start command successfully, pid: 127, command: [bash -c bash /modelarts-job-16edef32-d900-4530-b12e-7158febf3c2a/MA-CUSTOM-COMMAND.sh 2>&1]" file="process.go:145" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:27Z" level=info msg="zombie process cleaner is start running, childPid=127" file="cleaner_unix.go:31" 
Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service sh: /modelarts/authoring/script/entrypoint/common/terminal_tips.sh: No such file or directory INFO:root:Using MoXing-v2.1.16.2ae09d45-2ae09d45 INFO:root:Using OBS-Python-SDK-3.20.9.1 [ModelArts Service Log]2025-01-20 04:09:29,101 - INFO - Ascend Driver: Version=23.0.3 [ModelArts Service Log]2025-01-20 04:09:29,102 - INFO - you are advised to use ASCEND_DEVICE_ID env instead of DEVICE_ID, as the DEVICE_ID env will be discarded in later versions [ModelArts Service Log]2025-01-20 04:09:29,102 - INFO - particularly, ${ASCEND_DEVICE_ID} == ${DEVICE_ID}, it's the logical device id [ModelArts Service Log]2025-01-20 04:09:29,102 - INFO - Davinci training command [ModelArts Service Log]2025-01-20 04:09:29,102 - INFO - ['python', '/home/ma-user/grampus.py', '--code_url=s3:///urchincache/job/sheng2025012012t083443716/code/master.zip', '--grampus_code_file_name=pre_and_suf.py', '--grampus_code_url=s3:///grampus/system_code/', '--model_url=s3:///grampus/job/sheng2025012012t083443716/output/', '--multi_data_url=[{"dataset_url":"s3:///urchincache/jobs/sheng2025012012t083443716/cache/dataset/","dataset_name":"dataset","containerPath":"/cache/dataset","readOnly":false,"isNeedUnzip":true},{"dataset_url":"s3:///urchincache/attachment/a/5/a5f475a1-93ac-44c8-b88b-f4c3cf6c887a/cifar-10-python.tar.gz","dataset_name":"cifar-10-python.tar.gz","containerPath":"/cache/dataset/cifar-10-python.tar.gz","readOnly":true,"isNeedUnzip":true}]', '--pretrain_url=[{"model_url":"s3:///urchincache/aimodels/a/4/a4257430-1c9c-46d3-adc2-7bd34098d937/","model_name":"resnet50-0676ba61","containerPath":"/cache/pretrainmodel/resnet50-0676ba61","readOnly":true,"isNeedUnzip":false}]', '--boot_file=train_npu.py', '--code_name=train'] [ModelArts Service Log]2025-01-20 04:09:29,102 - INFO - Wait for Rank table file ready [ModelArts Service Log]2025-01-20 04:09:29,102 - INFO - Rank table file (K8S generated) is ready for read [ModelArts 
Service Log]2025-01-20 04:09:29,102 - INFO - { "status": "completed", "group_count": "1", "group_list": [ { "group_name": "worker", "device_count": "1", "instance_count": "1", "instance_list": [ { "pod_name": "ma-job-16edef32-d900-4530-b12e-7158febf3c2a-worker-0", "server_id": "172.18.161.169", "devices": [ { "device_id": "0", "device_ip": "29.112.105.39" } ] } ] } ] }
Traceback (most recent call last):
  File "/home/ma-user/davinci/train/davincirun.py", line 62, in <module>
    instance = rank_table.get_current_instance()
  File "/home/ma-user/davinci/train/rank_table.py", line 280, in get_current_instance
    server_list = self.rank_table['server_list']
KeyError: 'server_list'
time="2025-01-20T04:09:29Z" level=info msg="command is exit with 255" file="process.go:176" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service
time="2025-01-20T04:09:29Z" level=info msg="no more children pid, we are done" file="cleaner_unix.go:86" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service
time="2025-01-20T04:09:29Z" level=info msg="NPU training process exits with exit code 255, and the environment will be retained for 0s." file="process.go:215"
time="2025-01-20T04:09:29Z" level=info msg="the environment has been retained for 0s." 
file="process.go:217" time="2025-01-20T04:09:29Z" level=info msg="DetectResult msg no error\n" file="process.go:195" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="event listener(name: startAndWait) exited" file="controller.go:93" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="zombie process cleaner is exiting" file="cleaner_unix.go:53" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="event(name: UserProcessTerminatedEvent, msg: ) is being handled, len: 0" file="controller.go:63" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="the number of event listener is zero, registration is not required" file="controller.go:85" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="npu reset config file watcher has exited" file="watcher.go:59" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="NPU utils watcher is exiting without NPU startup observed" file="watch_routine.go:36" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="npu hot reset config listener is exiting" file="listener.go:432" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="npu hot reset configuration listener has exited" file="listener.go:452" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="event listener(name: watchNPUResetConfig) exited" file="controller.go:93" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service 
time="2025-01-20T04:09:29Z" level=info msg="context is closing, shutting down the http server" file="server.go:51" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="http server shutdown successfully" file="server.go:58" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="event listener(name: listenAndServe) exited" file="controller.go:93" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="controller exit with 255" file="controller.go:78" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="post runtime info collection started" file="run_train.go:1061" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="skip collecting mountstats, reason: MA_NFS_MOUNT_VOLUMES env is not found" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="skip collecting nfs_metrics, reason: MA_NFS_MOUNT_VOLUMES env is not found" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="skip collecting nvlink_status, reason: not a gpu job" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="collect metrics for NPU memory diagnose started" file="collector_npu.go:78" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="skip collecting ib_abnormal_physical_state, reason: not an infiniband job" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service 
time="2025-01-20T04:09:29Z" level=info msg="skip collecting fabric_manager_status, reason: not a gpu job" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=warning msg="ascend-dmi executor create failed, error: ascend-dmi is not found in the /usr/local/Ascend/toolbox path, skip ascend-dmi after check step" file="ascend_check.go:114" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="collect metrics for NPU congestion diagnose started" file="collector_npu.go:35" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="skip collecting gpu_info, reason: not a gpu job" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="skip collecting ib_stats, reason: not an infiniband job" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=warning msg="collect core num failed, will set core num val to 0, err: query prometheus raw metrics failed, err: error_code: APIGW.0101, error_msg: The API does not exist: method GET not found" file="collector_npu.go:140" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="skip collecting node_ssd_usage, reason: query prom metrics failed, err: query prometheus raw metrics failed, err: error_code: APIGW.0101, error_msg: The API does not exist: method GET not found" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="skip collecting node_iowait, reason: query prom metrics failed, err: query prometheus raw metrics failed, err: error_code: APIGW.0101, error_msg: The API does not exist: 
method GET not found" file="collector.go:147" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=warning msg="query prom raw metrics failed, err: error_code: APIGW.0101, error_msg: The API does not exist: method GET not found" file="collector_npu.go:107" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=warning msg="collect node memory usage failed, err: query prometheus raw metrics failed, err: error_code: APIGW.0101, error_msg: The API does not exist: method GET not found" file="collector_npu.go:153" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=warning msg="query prom raw metrics failed, err: error_code: APIGW.0101, error_msg: The API does not exist: method GET not found" file="collector_npu.go:107" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="collect metrics for NPU memory diagnose finished" file="collector_npu.go:125" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=warning msg="collect iowait failed, err: query prometheus raw metrics failed, err: error_code: APIGW.0101, error_msg: The API does not exist: method GET not found" file="collector_npu.go:167" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=warning msg="query prom raw metrics failed, err: error_code: APIGW.0101, error_msg: The API does not exist: method GET not found" file="collector_npu.go:191" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=warning msg="query prom raw metrics failed, err: error_code: APIGW.0101, error_msg: The API does not exist: method GET not found" file="collector_npu.go:191" Command=bootstrap/run 
Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:09:29Z" level=info msg="collect metrics for NPU congestion diagnose finished" file="collector_npu.go:71" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:10:19Z" level=warning msg="post runtime info collection timeout" file="run_train.go:1081" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:10:19Z" level=info msg="report event TrainingExit success" file="event.go:82" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:10:19Z" level=error msg="bootstrap is exiting with exit code 255" file="bootstrap.go:280" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T04:10:19Z" level=info msg="retCode 255 has been written to the retCode file /home/ma-user/modelarts/retCode" file="bootstrap.go:258" Command=bootstrap/run Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=info msg="[sidecar] training is completed" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=info msg="[sidecar] the reason for the failure of the training job is under analysis" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=warning msg="the log-preview-size parameter exceeds the limit and will be set to the default value 5242880" file="cli.go:236" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=info msg="[sidecar] fault diagnose starting" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=info msg="[log extractor] time required for extracting file content: 0 ms" file="analyzer.go:177" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=warning msg="extract [cuda 
version extractor] is empty for reason: not a gpu job, skip the extraction step" file="analyzer.go:172" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=warning msg="extract [cudnn version extractor] is empty for reason: not a gpu job, skip the extraction step" file="analyzer.go:172" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=warning msg="extract [gpu driver version extractor] is empty for reason: not a gpu job, skip the extraction step" file="analyzer.go:172" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=info msg="[DF extractor] time required for extracting file content: 0 ms" file="analyzer.go:177" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=info msg="[env extractor] time required for extracting file content: 0 ms" file="analyzer.go:177" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=info msg="[modules extractor] time required for extracting file content: 0 ms" file="analyzer.go:177" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=warning msg="extract [IB state extractor] is empty for reason: not an IB job, skip the extraction step" file="analyzer.go:172" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=info msg="[ifconfig extractor] time required for extracting file content: 0 ms" file="analyzer.go:177" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=warning msg="extract [ethernet qos extractor] is empty for reason: not an IB job, skip the extraction step" file="analyzer.go:172" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service 
time="2025-01-20T12:10:19+08:00" level=warning msg="extract [gpuInfo extractor] is empty for reason: not a gpu job, skip the extraction step" file="analyzer.go:172" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=warning msg="extract [hostLog extractor] is empty for reason: open /home/ma-user/modelarts/log/messages-during-ma-job-16edef32-d900-4530-b12e-7158febf3c2a.log: no such file or directory" file="analyzer.go:172" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=warning msg="extract [ibGid extractor] is empty for reason: not an IB job, skip the extraction step" file="analyzer.go:172" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=info msg="[hugePageEnabled extractor] time required for extracting file content: 0 ms" file="analyzer.go:177" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=warning msg="extract [ofedVersion extractor] is empty for reason: not an IB job, skip the extraction step" file="analyzer.go:172" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=info msg="[basicMetrics extractor] time required for extracting file content: 0 ms" file="analyzer.go:177" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=warning msg="extract [psOutput extractor] is empty for reason: open /home/ma-user/modelarts/log/runtime_info_after_starting_train_ma-job-16edef32-d900-4530-b12e-7158febf3c2a-worker-0/ps_output: no such file or directory" file="analyzer.go:172" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=info msg="[uLimit extractor] time required for extracting file content: 0 ms" file="analyzer.go:177" Command=analyze 
Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=warning msg="extract [mountStats extractor] is empty for reason: open /home/ma-user/modelarts/log/runtime_info_post_train_ma-job-16edef32-d900-4530-b12e-7158febf3c2a-worker-0/mountstats: no such file or directory" file="analyzer.go:172" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=warning msg="extract [gpuTopo extractor] is empty for reason: not a gpu job, skip the extraction step" file="analyzer.go:172" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=info msg="[directories permission extractor] time required for extracting file content: 0 ms" file="analyzer.go:177" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=warning msg="extract [nfs mount extractor] is empty for reason: open /home/ma-user/modelarts/log/runtime_info_pre_train_ma-job-16edef32-d900-4530-b12e-7158febf3c2a-worker-0/nfs_mount_path_check: no such file or directory" file="analyzer.go:172" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=info msg="[pip list extractor] time required for extracting file content: 0 ms" file="analyzer.go:177" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=warning msg="extract [ib driver version in host extractor] is empty for reason: not an IB job, skip the extraction step" file="analyzer.go:172" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=warning msg="extract [envs in processes extractor] is empty for reason: open /home/ma-user/modelarts/log/runtime_info_after_starting_train_ma-job-16edef32-d900-4530-b12e-7158febf3c2a-worker-0/runtime_processes_envs: no such file or directory" 
file="analyzer.go:172" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=info msg="time required for conclude: 0 ms" file="analyzer.go:245" Command=analyze Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=info msg="[sidecar] stop toolkit_host_log_collection_pid = 45 by signal SIGTERM" Component=ShellScripts Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=info msg="ctx is closing, dump host logs for the last time" file="host_log.go:46" Command=host-log-collection Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=info msg="dump host logs to /home/ma-user/modelarts/log succeed" file="host_log.go:52" Command=host-log-collection Component=ma-training-toolkit Platform=ModelArts-Service time="2025-01-20T12:10:19+08:00" level=info msg="[sidecar] stop toolkit_obs_upload_by_channels_pid = 47 by signal SIGTERM" Component=ShellScripts Platform=ModelArts-Service failed
我也是
liuzx commented 2 months ago
Owner
任务名叫什么
shengshanbai04 commented 2 months ago
Poster
只要选择910b,任务还没进入训练脚本就直接failed了,脚本里的print都还没有输出
shengshanbai04 commented 2 months ago
Poster
这是已经失败的一个任务
157 KiB
liuzx commented 2 months ago
Owner
已反馈给开发,待修复
好的,应该不是我的问题
好的,应该不是我的问题
liuzx commented 3 days ago
Owner
请用新的 CANN 8.0.0 镜像创建任务
Sign in to join this conversation.
No Label
No Milestone
No Assignees
3 Participants
Notifications
Due Date

No due date set.

Dependencies

This issue currently doesn't have any dependencies.

Loading…
There is no content yet.