diff --git a/models/cloudbrain.go b/models/cloudbrain.go
index 86999d1b6b..8f2e6dd5ea 100755
--- a/models/cloudbrain.go
+++ b/models/cloudbrain.go
@@ -1677,6 +1677,13 @@ type GetGrampusJobEventsResponse struct {
 	JobEvents []GrampusJobEvents `json:"jobEvents"`
 	TotalSize int                `json:"totalSize"`
 }
+// GetGrampusDebugJobEventsResponse is the payload decoded from the Grampus
+// notebook (debug job) events endpoint.
+type GetGrampusDebugJobEventsResponse struct {
+	GrampusResult
+	NotebookEvents []GrampusJobEvents `json:"notebookEvents"`
+	TotalSize      int                `json:"totalSize"`
+}
 
 type GrampusTasks struct {
 	Command string `json:"command"`
@@ -1947,6 +1954,9 @@ func QueryModelTrainJobList(repoId int64) ([]*Cloudbrain, int, error) {
 	// cond = cond.And(
 	// builder.In("type", 0, 1),
 	// )
+	cond = cond.And(
+		builder.In("compute_resource", "NPU", "CPU/GPU"),
+	)
 
 	cloudbrains := make([]*Cloudbrain, 0)
 	if err := sess.Select("*").Table(&Cloudbrain{}).Where(cond).OrderBy("created_unix DESC").
diff --git a/modules/grampus/resty.go b/modules/grampus/resty.go
index b614b0d3cc..0315fd9d38 100755
--- a/modules/grampus/resty.go
+++ b/modules/grampus/resty.go
@@ -425,7 +425,42 @@ sendjob:
 	return &result, nil
 }
 
-func GetJobEvents(jobID string) (*models.GetGrampusJobEventsResponse, error) {
+// GetDebugJobEvents fetches the events of the notebook (debug) job jobID from
+// Grampus. The request is re-sent once, after calling getToken, when the
+// response carries errorIllegalToken. It returns a nil result with any error.
+func GetDebugJobEvents(jobID string) (*models.GetGrampusDebugJobEventsResponse, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.GetGrampusDebugJobEventsResponse
+
+	retry := 0
+
+sendjob:
+	_, err := client.R().
+		SetAuthToken(TOKEN).
+		SetResult(&result).
+		Get(HOST + urlNotebookJob + "/" + jobID + "/events")
+
+	if err != nil {
+		return nil, fmt.Errorf("resty GetDebugJobEvents: %v", err)
+	}
+
+	if result.ErrorCode == errorIllegalToken && retry < 1 {
+		retry++
+		log.Info("retry get token")
+		_ = getToken()
+		goto sendjob
+	}
+
+	if result.ErrorCode != 0 {
+		log.Error("GetDebugJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg)
+		return nil, fmt.Errorf("GetDebugJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg)
+	}
+
+	return &result, nil
+}
+
+func GetTrainJobEvents(jobID string) (*models.GetGrampusJobEventsResponse, error) {
 	checkSetting()
 	client := getRestyClient()
 	var result models.GetGrampusJobEventsResponse
@@ -438,7 +473,7 @@ sendjob:
 		SetResult(&result).
 		Get(HOST + urlTrainJob + "/" + jobID + "/events")
 	if err != nil {
-		return nil, fmt.Errorf("resty GetJobEvents: %v", err)
+		return nil, fmt.Errorf("resty GetTrainJobEvents: %v", err)
 	}
 
 	if result.ErrorCode == errorIllegalToken && retry < 1 {
@@ -449,8 +484,8 @@ sendjob:
 	}
 
 	if result.ErrorCode != 0 {
-		log.Error("GetJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg)
-		return nil, fmt.Errorf("GetJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg)
+		log.Error("GetTrainJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg)
+		return nil, fmt.Errorf("GetTrainJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg)
 	}
 
 	return &result, nil
diff --git a/routers/api/v1/api.go b/routers/api/v1/api.go
index af37e9251a..0615e28d80 100755
--- a/routers/api/v1/api.go
+++ b/routers/api/v1/api.go
@@ -1122,6 +1122,7 @@ func RegisterRoutes(m *macaron.Macaron) {
 		m.Group("/grampus", func() {
 			m.Group("/notebook", func() {
 				m.Get("/:id", repo_ext.GetGrampusNotebook)
+				m.Get("/:id/job_event", repo_ext.GrampusDebugJobEvents)
 			})
 			m.Group("/train-job", func() {
 				m.Group("/:jobid", func() {
@@ -1131,7 +1132,7 @@ func RegisterRoutes(m *macaron.Macaron) {
 					m.Get("/metrics", repo_ext.GrampusMetrics)
 					m.Get("/download_multi_model", cloudbrain.AdminOrJobCreaterRightForTrain, repo.MultiModelDownload)
 					m.Get("/download_log", cloudbrain.AdminOrJobCreaterRightForTrain, repo_ext.GrampusDownloadLog)
-					m.Get("/job_event", repo_ext.GrampusJobEvents)
+					m.Get("/job_event", repo_ext.GrampusTrainJobEvents)
 				})
 			})
 		}, reqRepoReader(models.UnitTypeCloudBrain))
diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go
index 3cc30d161c..4a071bd37b 100755
--- a/routers/repo/grampus.go
+++ b/routers/repo/grampus.go
@@ -1690,8 +1690,34 @@ func GrampusMetrics(ctx *context.Context) {
 
 	return
 }
+// GrampusDebugJobEvents responds with the Grampus notebook events of the
+// debug task identified by the ":id" route parameter.
+func GrampusDebugJobEvents(ctx *context.Context) {
+	ID := ctx.Params(":id")
+	job, err := models.GetCloudbrainByID(ID)
+	if err != nil {
+		log.Error("GetCloudbrainByID failed: %v", err, ctx.Data["MsgID"])
+		ctx.ServerError(err.Error(), err)
+		return
+	}
+
+	result, err := grampus.GetDebugJobEvents(job.JobID)
+	if err != nil {
+		log.Error("GetDebugJobEvents failed: %v", err, ctx.Data["MsgID"])
+		// result is nil whenever err is non-nil; stop here rather than
+		// dereferencing it below.
+		ctx.ServerError(err.Error(), err)
+		return
+	}
+	ctx.JSON(http.StatusOK, map[string]interface{}{
+		"JobID":     ID,
+		"JobEvents": result.NotebookEvents,
+	})
+
+	return
+}
 
-func GrampusJobEvents(ctx *context.Context) {
+func GrampusTrainJobEvents(ctx *context.Context) {
 	jobID := ctx.Params(":jobid")
 	job, err := models.GetCloudbrainByJobID(jobID)
 	if err != nil {
@@ -1700,7 +1726,7 @@ func GrampusJobEvents(ctx *context.Context) {
 		return
 	}
 
-	result, err := grampus.GetJobEvents(job.JobID)
+	result, err := grampus.GetTrainJobEvents(job.JobID)
 	if err != nil {
 		log.Error("GetJobEvents failed: %v", err, ctx.Data["MsgID"])
 	}
@@ -1775,12 +1801,12 @@ func generateCommand(repoName, processorType, bootFile, paramSrc, outputRemotePa
 		commandCode = "source /home/ma-user/.bashrc;python /home/ma-user/davinci/train/davincirun.py python /home/ma-user/openi.py " + paramCode + ";"
 	} else if processorType == grampus.ProcessorTypeGPU {
 		if pretrainModelFileName != "" {
-			paramCode += " --ckpt_url" + "='" + workDir + "pretrainmodel/" + pretrainModelFileName+"'"
+			paramCode += " --ckpt_url" + "='" + workDir + "pretrainmodel/" + pretrainModelFileName + "'"
 		}
 		commandCode = "cd " + workDir + "code/" + strings.ToLower(repoName) + ";python " + bootFile + paramCode + ";"
 	} else if processorType == grampus.ProcessorTypeGCU {
 		if pretrainModelFileName != "" {
-			paramCode += " --ckpt_url" + "='" + workDir + "pretrainmodel/" + pretrainModelFileName+"'"
+			paramCode += " --ckpt_url" + "='" + workDir + "pretrainmodel/" + pretrainModelFileName + "'"
 		}
 		commandCode = "cd " + workDir + "code/" + strings.ToLower(repoName) + ";python3 " + bootFile + paramCode + ";"
 	}
diff --git a/templates/repo/grampus/notebook/gcu/new.tmpl b/templates/repo/grampus/notebook/gcu/new.tmpl
index a83365efcb..365b135f1d 100644
--- a/templates/repo/grampus/notebook/gcu/new.tmpl
+++ b/templates/repo/grampus/notebook/gcu/new.tmpl
@@ -99,7 +99,7 @@
 {{end}}
-    {{template "custom/select_model" .}}
+