#2056 GPU训练任务详情增加展示运行简况以及错误诊断

Merged
ychao_1983 merged 1 commits from fix-2055 into V20220519 2 years ago
  1. +15
    -27
      routers/repo/cloudbrain.go
  2. +61
    -5
      templates/repo/cloudbrain/trainjob/show.tmpl

+ 15
- 27
routers/repo/cloudbrain.go View File

@@ -491,34 +491,22 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo
}
}
taskRoles := jobRes.TaskRoles
if jobRes.JobStatus.State != string(models.JobFailed) {

taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
ctx.Data["taskRes"] = taskRes
task.Status = taskRes.TaskStatuses[0].State
task.ContainerID = taskRes.TaskStatuses[0].ContainerID
task.ContainerIp = taskRes.TaskStatuses[0].ContainerIP
models.ParseAndSetDurationFromCloudBrainOne(jobRes, task)

if task.DeletedAt.IsZero() { //normal record
err = models.UpdateJob(task)
if err != nil {
ctx.Data["error"] = err.Error()
return
}
} else { //deleted record

taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
ctx.Data["taskRes"] = taskRes
ctx.Data["ExitDiagnostics"] = taskRes.TaskStatuses[0].ExitDiagnostics
task.Status = taskRes.TaskStatuses[0].State
task.ContainerID = taskRes.TaskStatuses[0].ContainerID
task.ContainerIp = taskRes.TaskStatuses[0].ContainerIP
models.ParseAndSetDurationFromCloudBrainOne(jobRes, task)

if task.DeletedAt.IsZero() { //normal record
err = models.UpdateJob(task)
if err != nil {
ctx.Data["error"] = err.Error()
return
}
} else {
task.Status = jobRes.JobStatus.State
taskRes := models.TaskPod{TaskStatuses: []models.TaskStatuses{
{
State: jobRes.JobStatus.State,
},
}}
ctx.Data["taskRes"] = taskRes
jobRes.JobStatus.StartTime = time.Unix(int64(task.CreatedUnix), 0).Format("2006-01-02 15:04:05")
jobRes.JobStatus.EndTime = time.Unix(int64(task.UpdatedUnix), 0).Format("2006-01-02 15:04:05")
} else { //deleted record

}

ctx.Data["result"] = jobRes


+ 61
- 5
templates/repo/cloudbrain/trainjob/show.tmpl View File

@@ -214,8 +214,9 @@ td, th {
<div class="content-pad">
<div class="ui pointing secondary menu" style="border-bottom: 1px solid rgba(34,36,38,.15);">
<a class="active item" data-tab="first{{$k}}">{{$.i18n.Tr "repo.modelarts.train_job.config"}}</a>
<a class="item" data-tab="second{{$k}}" onclick="loadLog({{.VersionName}})">{{$.i18n.Tr "repo.modelarts.log"}}</a>
<a class="item" data-tab="third{{$k}}" onclick="loadModelFile({{.VersionName}},'','','init')">{{$.i18n.Tr "repo.model_download"}}</a>
<a class="item" data-tab="second{{$k}}" onclick="javascript:parseInfo()">{{$.i18n.Tr "repo.cloudbrain.runinfo"}}</a>
<a class="item" data-tab="third{{$k}}" onclick="loadLog({{.VersionName}})">{{$.i18n.Tr "repo.modelarts.log"}}</a>
<a class="item" data-tab="four{{$k}}" onclick="loadModelFile({{.VersionName}},'','','init')">{{$.i18n.Tr "repo.model_download"}}</a>
</div>
<div class="ui tab active" data-tab="first{{$k}}">
<div style="padding-top: 10px;">
@@ -376,7 +377,25 @@ td, th {
</div>
</div>

<div class="ui tab" data-tab="second{{$k}}">
<div>
<div class="ui message message{{.VersionName}}" style="display: none;">
<div id="header"></div>
</div>
<div class="ui attached log" id="log{{.VersionName}}" style="height: 390px !important; overflow: auto;">
<input type="hidden" id="json_value" value="{{$.result.JobStatus.AppExitDiagnostics}}">
<input type="hidden" id="ExitDiagnostics" value="{{$.ExitDiagnostics}}">
<span id="info_display" class="info_text">
</span>
</div>

</div>

</div>

<div class="ui tab" data-tab="third{{$k}}">
<div>
<div class="ui message message{{.VersionName}}" style="display: none;">
<div id="header"></div>
@@ -391,7 +410,7 @@ td, th {
</div>

<div class="ui tab" data-tab="third{{$k}}">
<div class="ui tab" data-tab="four{{$k}}">
<input type="hidden" name="model{{.VersionName}}" value="-1">
<input type="hidden" name="modelback{{.VersionName}}" value="-1">
<div class='ui breadcrumb model_file_bread' id='file_breadcrumb{{.VersionName}}'>
@@ -609,10 +628,10 @@ td, th {
// }
let status = $(`#${versionname}-status-span`).text()
if(['IMAGE_FAILED','SUBMIT_FAILED','DELETE_FAILED','KILLED','COMPLETED','FAILED','CANCELED','LOST','START_FAILED','SUCCEEDED'].includes(status)){
if(['IMAGE_FAILED','SUBMIT_FAILED','DELETE_FAILED','KILLED','COMPLETED','FAILED','CANCELED','LOST','START_FAILED','SUCCEEDED','STOPPED'].includes(status)){
return
}
let stopArray=["KILLED","FAILED","START_FAILED","KILLING","COMPLETED","SUCCEEDED"]
let stopArray=["KILLED","FAILED","START_FAILED","KILLING","COMPLETED","SUCCEEDED","STOPPED"]
$.get(`/api/v1/repos/${repoPath}/cloudbrain/${taskID}?version_name=${versionname}`, (data) => {
//$(`#${versionname}-duration-span`).text(data.JobDuration)
$(`#${versionname}-status-span span`).text(data.JobStatus)
@@ -649,5 +668,42 @@ td, th {
});
stopBubbling(arguments.callee.caller.arguments[0])
}

// parseInfo renders the "run info" tab: it reads the job diagnostics JSON from
// the hidden #json_value input and the raw exit diagnostics text from the
// hidden #ExitDiagnostics input, then writes the result as HTML into
// #info_display.
//
// All diagnostic text is HTML-escaped before it is assigned to innerHTML,
// because it is produced by the job/cluster and must not be interpreted as
// markup. Missing or malformed JSON is treated as "no pod events" so that the
// exit diagnostics section is still shown.
function parseInfo(){
    // Escape a value for safe inclusion in HTML text content.
    const escapeHtml = (value) => String(value)
        .replace(/&/g, "&amp;")
        .replace(/</g, "&lt;")
        .replace(/>/g, "&gt;")
        .replace(/"/g, "&quot;")
        .replace(/'/g, "&#39;");

    // Render a list of {reason, message, action} entries as paragraphs.
    const renderEvents = (events) => {
        let out = "";
        if (!Array.isArray(events)) {
            return out;
        }
        for (const event of events) {
            if (event == null) {
                continue;
            }
            out += "<p><b>[" + escapeHtml(event["reason"]) + "]</b></p>";
            out += "<p>" + escapeHtml(event["message"]) + "</p>";
            out += "<p>" + escapeHtml(event["action"]) + "</p>";
        }
        return out;
    };

    const display = document.getElementById("info_display");
    if (display == null) {
        return;
    }

    const jsonInput = document.getElementById("json_value");
    const jsonValue = jsonInput != null ? jsonInput.value : "";
    let jsonObj = null;
    if (jsonValue) {
        try {
            jsonObj = JSON.parse(jsonValue);
        } catch (e) {
            // Diagnostics are optional; fall through and show only ExitDiagnostics.
            jsonObj = null;
        }
    }

    let html = "";
    const podRoleName = jsonObj != null ? jsonObj["podRoleName"] : null;
    if (podRoleName != null) {
        // "task1-0" maps to the pod name that keys the podEvents object.
        const podName = podRoleName["task1-0"];
        const podEvents = jsonObj["podEvents"];
        if (podEvents != null) {
            html += renderEvents(podEvents[podName]);
        }
        html += renderEvents(jsonObj["extras"]);
    }

    const exitInput = document.getElementById("ExitDiagnostics");
    const exitText = exitInput != null ? exitInput.value : "";
    // Escape first, then turn line breaks into <br> so only our own tags survive.
    const exitHtml = escapeHtml(exitText).replace(/\r?\n/g, "<br>");

    html += "<p><b>[ExitDiagnostics]</b></p>";
    html += "<p>" + exitHtml + "</p>";
    display.innerHTML = html;
}
</script>

Loading…
Cancel
Save