From 647ff435e526d447e3c11e2100674391ff4cd52d Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Wed, 16 Mar 2022 14:18:23 +0800 Subject: [PATCH 01/31] index --- models/cloudbrain.go | 7 ++++++ models/file_chunk.go | 5 ---- modules/modelarts/modelarts.go | 1 - routers/admin/cloudbrains.go | 6 ++--- routers/repo/modelarts.go | 45 +++++++++++++++++++++++++--------- 5 files changed, 44 insertions(+), 20 deletions(-) mode change 100644 => 100755 routers/admin/cloudbrains.go diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 06c2e98b4f..96b827994e 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -19,9 +19,16 @@ type CloudbrainStatus string type JobType string type ModelArtsJobStatus string +const ( + TypeCloudBrainAll = -1 + TypeCloudBrainOne int = iota + TypeCloudBrainTwo +) + const ( NPUResource = "NPU" GPUResource = "CPU/GPU" + AllResource = "all" //notebook storage category EVSCategory = "EVS" diff --git a/models/file_chunk.go b/models/file_chunk.go index 76c926dc5f..0fc3a88794 100755 --- a/models/file_chunk.go +++ b/models/file_chunk.go @@ -13,11 +13,6 @@ const ( FileUploaded ) -const ( - TypeCloudBrainOne int = iota - TypeCloudBrainTwo -) - type FileChunk struct { ID int64 `xorm:"pk autoincr"` UUID string `xorm:"uuid UNIQUE"` diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index b740b11675..e6eaa15e2a 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -54,7 +54,6 @@ const ( PerPage = 10 IsLatestVersion = "1" NotLatestVersion = "0" - DebugType = -1 VersionCount = 1 SortByCreateTime = "create_time" diff --git a/routers/admin/cloudbrains.go b/routers/admin/cloudbrains.go old mode 100644 new mode 100755 index 6bbd534b9b..884ed6b9b7 --- a/routers/admin/cloudbrains.go +++ b/routers/admin/cloudbrains.go @@ -41,7 +41,7 @@ func CloudBrains(ctx *context.Context) { if page <= 0 { page = 1 } - debugType := modelarts.DebugType + debugType := models.TypeCloudBrainAll if listType 
== models.GPUResource { debugType = models.TypeCloudBrainOne } else if listType == models.NPUResource { @@ -121,7 +121,7 @@ func DownloadCloudBrains(ctx *context.Context) { Page: page, PageSize: 1, }, - Type: modelarts.DebugType, + Type: models.TypeCloudBrainAll, NeedRepoInfo: false, IsLatestVersion: modelarts.IsLatestVersion, }) @@ -151,7 +151,7 @@ func DownloadCloudBrains(ctx *context.Context) { Page: page, PageSize: pageSize, }, - Type: modelarts.DebugType, + Type: models.TypeCloudBrainAll, NeedRepoInfo: true, IsLatestVersion: modelarts.IsLatestVersion, }) diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 9c670e2037..26e09ec427 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -46,20 +46,26 @@ const ( ) func DebugJobIndex(ctx *context.Context) { - debugListType := ctx.Query("debugListType") - ctx.Data["ListType"] = debugListType + listType := ctx.Query("debugListType") + ctx.Data["ListType"] = listType MustEnableCloudbrain(ctx) repo := ctx.Repo.Repository page := ctx.QueryInt("page") if page <= 0 { page = 1 } - debugType := modelarts.DebugType + typeCloudBrain := models.TypeCloudBrainAll jobTypeNot := false - if debugListType == models.GPUResource { - debugType = models.TypeCloudBrainOne - } else if debugListType == models.NPUResource { - debugType = models.TypeCloudBrainTwo + if listType == models.GPUResource { + typeCloudBrain = models.TypeCloudBrainOne + } else if listType == models.NPUResource { + typeCloudBrain = models.TypeCloudBrainTwo + } else if listType == models.AllResource { + typeCloudBrain = models.TypeCloudBrainAll + } else { + log.Error("listType(%s) error", listType) + ctx.ServerError("listType error", errors.New("listType error")) + return } var jobTypes []string @@ -70,7 +76,7 @@ func DebugJobIndex(ctx *context.Context) { PageSize: setting.UI.IssuePagingNum, }, RepoID: repo.ID, - Type: debugType, + Type: typeCloudBrain, JobTypeNot: jobTypeNot, JobTypes: jobTypes, }) @@ -92,7 +98,7 @@ func 
DebugJobIndex(ctx *context.Context) { ctx.Data["Tasks"] = ciTasks ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx) ctx.Data["RepoIsEmpty"] = repo.IsEmpty - ctx.Data["debugListType"] = debugListType + ctx.Data["debugListType"] = listType ctx.HTML(200, tplDebugJobIndex) } @@ -473,6 +479,23 @@ func TrainJobIndex(ctx *context.Context) { page = 1 } + listType := ctx.Query("listType") + ctx.Data["ListType"] = listType + + typeCloudBrain := models.TypeCloudBrainAll + if listType == models.GPUResource { + typeCloudBrain = models.TypeCloudBrainOne + } else if listType == models.NPUResource { + typeCloudBrain = models.TypeCloudBrainTwo + } else if listType == models.AllResource { + typeCloudBrain = models.TypeCloudBrainAll + } + //else { + // log.Error("listType(%s) error", listType) + // ctx.ServerError("listType error", errors.New("listType error")) + // return + //} + var jobTypes []string jobTypes = append(jobTypes, string(models.JobTypeTrain)) tasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{ @@ -481,7 +504,7 @@ func TrainJobIndex(ctx *context.Context) { PageSize: setting.UI.IssuePagingNum, }, RepoID: repo.ID, - Type: models.TypeCloudBrainTwo, + Type: typeCloudBrain, JobTypeNot: false, JobTypes: jobTypes, IsLatestVersion: modelarts.IsLatestVersion, @@ -2246,7 +2269,7 @@ func SetJobCount(ctx *context.Context) { repoId := ctx.Repo.Repository.ID _, jobCount, err := models.Cloudbrains(&models.CloudbrainsOptions{ RepoID: repoId, - Type: modelarts.DebugType, + Type: models.TypeCloudBrainAll, }) if err != nil { ctx.ServerError("Get job faild:", err) -- 2.34.1 From 8dfb8761bfddae46115c61b14815f58d6a444e26 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Wed, 16 Mar 2022 18:12:10 +0800 Subject: [PATCH 02/31] create --- modules/auth/cloudbrain.go | 3 +++ routers/repo/cloudbrain.go | 38 +++++++++++++++++++++++++++++--------- routers/repo/modelarts.go | 6 +++++- routers/routes/routes.go | 15 +++++++++++++++ 4 files changed, 52 
insertions(+), 10 deletions(-) diff --git a/modules/auth/cloudbrain.go b/modules/auth/cloudbrain.go index 9949feddc1..9d3d6290f5 100755 --- a/modules/auth/cloudbrain.go +++ b/modules/auth/cloudbrain.go @@ -20,6 +20,9 @@ type CreateCloudBrainForm struct { ResourceSpecId int `form:"resource_spec_id" binding:"Required"` BenchmarkTypeID int `form:"benchmark_types_id"` BenchmarkChildTypeID int `form:"benchmark_child_types_id"` + BootFile string `form:"boot_file"` + Params string `form:"run_para_list"` + BranchName string `form:"branch_name"` } type CommitImageCloudBrainForm struct { diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 6e88b266db..7125935aae 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -35,6 +35,8 @@ const ( tplCloudBrainBenchmarkIndex base.TplName = "repo/cloudbrain/benchmark/index" tplCloudBrainBenchmarkNew base.TplName = "repo/cloudbrain/benchmark/new" tplCloudBrainBenchmarkShow base.TplName = "repo/cloudbrain/benchmark/show" + + tplCloudBrainTrainJobNew base.TplName = "repo/cloudbrain/trainjob/new" ) var ( @@ -187,32 +189,37 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { resourceSpecId := form.ResourceSpecId repo := ctx.Repo.Repository - tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeDebug), displayJobName) + tpl := tplCloudBrainNew + if jobType == string(models.JobTypeTrain) { + tpl = tplCloudBrainTrainJobNew + } + + tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, jobType, displayJobName) if err == nil { if len(tasks) != 0 { log.Error("the job name did already exist", ctx.Data["MsgID"]) cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr("the job name did already exist", tplCloudBrainNew, &form) + ctx.RenderWithErr("the job name did already exist", tpl, &form) return } } else { if !models.IsErrJobNotExist(err) { log.Error("system error, %v", err, ctx.Data["MsgID"]) cloudBrainNewDataPrepare(ctx) - 
ctx.RenderWithErr("system error", tplCloudBrainNew, &form) + ctx.RenderWithErr("system error", tpl, &form) return } } if !jobNamePattern.MatchString(displayJobName) { - ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tplCloudBrainNew, &form) + ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form) return } if jobType != string(models.JobTypeBenchmark) && jobType != string(models.JobTypeDebug) && jobType != string(models.JobTypeSnn4imagenet) && jobType != string(models.JobTypeBrainScore) { log.Error("jobtype error:", jobType, ctx.Data["MsgID"]) cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr("jobtype error", tplCloudBrainNew, &form) + ctx.RenderWithErr("jobtype error", tpl, &form) return } @@ -220,13 +227,13 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { if err != nil { log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"]) cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr("system error", tplCloudBrainNew, &form) + ctx.RenderWithErr("system error", tpl, &form) return } else { if count >= 1 { log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplCloudBrainNew, &form) + ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form) return } } @@ -269,11 +276,15 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { 0, 0, resourceSpecId) if err != nil { cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr(err.Error(), tplCloudBrainNew, &form) + ctx.RenderWithErr(err.Error(), tpl, &form) return } - ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all") + if jobType == string(models.JobTypeTrain) { + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=all") + } else { + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + 
"/debugjob?debugListType=all") + } } func CloudBrainRestart(ctx *context.Context) { @@ -1395,3 +1406,12 @@ func BenchmarkDel(ctx *context.Context) { ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark") } } + +func CloudBrainTrainJobNew(ctx *context.Context) { + err := cloudBrainNewDataPrepare(ctx) + if err != nil { + ctx.ServerError("get new train-job info failed", err) + return + } + ctx.HTML(http.StatusOK, tplCloudBrainTrainJobNew) +} diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 26e09ec427..88ed7c4b08 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -517,7 +517,11 @@ func TrainJobIndex(ctx *context.Context) { for i, task := range tasks { tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain) tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain) - tasks[i].ComputeResource = models.NPUResource + if task.Cloudbrain.Type == models.TypeCloudBrainOne { + tasks[i].ComputeResource = models.GPUResource + } else if task.Cloudbrain.Type == models.TypeCloudBrainTwo { + tasks[i].ComputeResource = models.NPUResource + } } pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5) diff --git a/routers/routes/routes.go b/routers/routes/routes.go index 2d146c2c6c..5082448ad0 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -1016,6 +1016,21 @@ func RegisterRoutes(m *macaron.Macaron) { m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.CloudBrainBenchmarkCreate) m.Get("/get_child_types", repo.GetChildTypes) }) + + m.Group("/train-job", func() { + m.Group("/:jobid", func() { + m.Get("", reqRepoCloudBrainReader, repo.TrainJobShow) + m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.TrainJobStop) + m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.TrainJobDel) + m.Get("/model_download", cloudbrain.AdminOrJobCreaterRightForTrain, repo.ModelDownload) + 
//m.Get("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, repo.TrainJobNewVersion) + //m.Post("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion) + }) + m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.CloudBrainTrainJobNew) + m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.CloudBrainCreate) + + //m.Get("/para-config-list", reqRepoCloudBrainReader, repo.TrainJobGetConfigList) + }) }, context.RepoRef()) m.Group("/modelmanage", func() { m.Post("/create_model", reqRepoModelManageWriter, repo.SaveModel) -- 2.34.1 From 8a395a68d5a6a77ada0fe64c3ac1b3c05feabcc3 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Thu, 17 Mar 2022 11:44:23 +0800 Subject: [PATCH 03/31] create --- modules/cloudbrain/cloudbrain.go | 2 ++ routers/repo/cloudbrain.go | 36 ++++++++++++++++++++++++++------ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index 54ac0c7acf..dc3f483b73 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -32,6 +32,8 @@ const ( SubTaskName = "task1" Success = "S000" + + DefaultBranchName = "master" ) var ( diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 7125935aae..923ff09533 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -183,15 +183,24 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { image := form.Image uuid := form.Attachment jobType := form.JobType - command := cloudbrain.Command gpuQueue := form.GpuType codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath resourceSpecId := form.ResourceSpecId + branchName := form.BranchName repo := ctx.Repo.Repository tpl := tplCloudBrainNew + command := cloudbrain.Command if jobType == string(models.JobTypeTrain) 
{ tpl = tplCloudBrainTrainJobNew + command, err := getTrainJobCommand(form) + if err != nil { + log.Error("getTrainJobCommand failed: %v", err) + ctx.RenderWithErr(err.Error(), tpl, &form) + return + } + + log.Info("%s", command) } tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, jobType, displayJobName) @@ -216,7 +225,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { return } - if jobType != string(models.JobTypeBenchmark) && jobType != string(models.JobTypeDebug) && jobType != string(models.JobTypeSnn4imagenet) && jobType != string(models.JobTypeBrainScore) { + if jobType != string(models.JobTypeBenchmark) && jobType != string(models.JobTypeDebug) && jobType != string(models.JobTypeSnn4imagenet) && jobType != string(models.JobTypeBrainScore) && jobType != string(models.JobTypeTrain) { log.Error("jobtype error:", jobType, ctx.Data["MsgID"]) cloudBrainNewDataPrepare(ctx) ctx.RenderWithErr("jobtype error", tpl, &form) @@ -238,7 +247,10 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { } } - downloadCode(repo, codePath) + if branchName == "" { + branchName = cloudbrain.DefaultBranchName + } + downloadCode(repo, codePath, branchName) uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/") modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/" @@ -764,8 +776,8 @@ func GetRate(ctx *context.Context) { } } -func downloadCode(repo *models.Repository, codePath string) error { - if err := git.Clone(repo.RepoPath(), codePath, git.CloneRepoOptions{}); err != nil { +func downloadCode(repo *models.Repository, codePath, branchName string) error { + if err := git.Clone(repo.RepoPath(), codePath, git.CloneRepoOptions{Branch: branchName}); err != nil { log.Error("Failed to clone repository: %s (%v)", repo.FullName(), err) return err } @@ -1316,7 +1328,7 @@ func CloudBrainBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainF } os.RemoveAll(codePath) - if err := 
downloadCode(repo, codePath); err != nil { + if err := downloadCode(repo, codePath, cloudbrain.DefaultBranchName); err != nil { log.Error("downloadCode failed, %v", err, ctx.Data["MsgID"]) cloudBrainNewDataPrepare(ctx) ctx.RenderWithErr("system error", tplCloudBrainBenchmarkNew, &form) @@ -1415,3 +1427,15 @@ func CloudBrainTrainJobNew(ctx *context.Context) { } ctx.HTML(http.StatusOK, tplCloudBrainTrainJobNew) } + +func getTrainJobCommand(form auth.CreateCloudBrainForm) (string, error) { + var command string + bootFile := form.BootFile + + if !strings.HasSuffix(bootFile, ".py") { + log.Error("bootFile(%s) format error", bootFile) + return command, errors.New("bootFile format error") + } + + return command, nil +} -- 2.34.1 From bcae95e566eb8ef80957eef1e950c2e0910edcfe Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Thu, 17 Mar 2022 19:17:52 +0800 Subject: [PATCH 04/31] create --- models/action.go | 1 + modules/cloudbrain/cloudbrain.go | 33 +- modules/setting/setting.go | 30 +- routers/repo/cloudbrain.go | 35 +- templates/repo/cloudbrain/trainjob/new.tmpl | 427 ++++++++++++++++++++ 5 files changed, 501 insertions(+), 25 deletions(-) create mode 100755 templates/repo/cloudbrain/trainjob/new.tmpl diff --git a/models/action.go b/models/action.go index 2a9d88399b..9b92b4192d 100755 --- a/models/action.go +++ b/models/action.go @@ -57,6 +57,7 @@ const ( ActionCreateInferenceTask // 28 ActionCreateBenchMarkTask //29 ActionCreateNewModelTask //30 + ActionCreateGPUTrainTask //31 ) // Action represents user operation type and other information to diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index dc3f483b73..8b0786b57b 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -15,9 +15,7 @@ import ( ) const ( - Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple; - service ssh stop; - jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" 
--port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"` + Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;set TEST1=1111;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"` //CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"` CommandBenchmark = `echo "start benchmark";cd /benchmark && bash run_bk.sh;echo "end benchmark"` CodeMountPath = "/code" @@ -37,7 +35,8 @@ const ( ) var ( - ResourceSpecs *models.ResourceSpecs + ResourceSpecs *models.ResourceSpecs + TrainResourceSpecs *models.ResourceSpecs ) func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool { @@ -157,12 +156,23 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, uuid var resourceSpec *models.ResourceSpec - if ResourceSpecs == nil { - json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) - } - for _, spec := range ResourceSpecs.ResourceSpec { - if resourceSpecId == spec.Id { - resourceSpec = spec + if jobType == string(models.JobTypeDebug) { + if ResourceSpecs == nil { + json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) + } + for _, spec := range ResourceSpecs.ResourceSpec { + if resourceSpecId == spec.Id { + resourceSpec = spec + } + } + } else if jobType == string(models.JobTypeTrain) { + if TrainResourceSpecs == nil { + json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) + } + for _, spec := range TrainResourceSpecs.ResourceSpec { + if resourceSpecId == spec.Id { + resourceSpec = spec + } } } @@ -265,6 +275,7 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, BenchmarkTypeID: benchmarkTypeID, BenchmarkChildTypeID: benchmarkChildTypeID, Description: description, + IsLatestVersion: "1", }) if err != nil { @@ -280,6 +291,8 @@ func 
GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, if string(models.JobTypeBenchmark) == jobType { notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateBenchMarkTask) + } else if string(models.JobTypeTrain) == jobType { + notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateGPUTrainTask) } else { notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugGPUTask) } diff --git a/modules/setting/setting.go b/modules/setting/setting.go index 2a29dd700a..7ae2263f74 100755 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -450,16 +450,18 @@ var ( DecompressOBSTaskName string //cloudbrain config - CBAuthUser string - CBAuthPassword string - RestServerHost string - JobPath string - CBCodePathPrefix string - JobType string - GpuTypes string - DebugServerHost string - ResourceSpecs string - MaxDuration int64 + CBAuthUser string + CBAuthPassword string + RestServerHost string + JobPath string + CBCodePathPrefix string + JobType string + GpuTypes string + DebugServerHost string + ResourceSpecs string + MaxDuration int64 + TrainGpuTypes string + TrainResourceSpecs string //benchmark config IsBenchmarkEnabled bool @@ -512,9 +514,9 @@ var ( ProfileID string PoolInfos string Flavor string - DebugHost string - ImageInfos string - Capacity int + DebugHost string + ImageInfos string + Capacity int //train-job ResourcePools string Engines string @@ -1283,6 +1285,8 @@ func NewContext() { GpuTypes = sec.Key("GPU_TYPES").MustString("") ResourceSpecs = sec.Key("RESOURCE_SPECS").MustString("") MaxDuration = sec.Key("MAX_DURATION").MustInt64(14400) + TrainGpuTypes = sec.Key("TRAIN_GPU_TYPES").MustString("") + TrainResourceSpecs = sec.Key("TRAIN_RESOURCE_SPECS").MustString("") sec = Cfg.Section("benchmark") IsBenchmarkEnabled = sec.Key("ENABLED").MustBool(false) diff --git 
a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 923ff09533..fb7909c8c9 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -45,6 +45,7 @@ var ( benchmarkTypes *models.BenchmarkTypes benchmarkGpuInfos *models.GpuInfos benchmarkResourceSpecs *models.ResourceSpecs + trainGpuInfos *models.GpuInfos ) var jobNamePattern = regexp.MustCompile(`^[a-z0-9][a-z0-9-_]{1,34}[a-z0-9-]$`) @@ -144,6 +145,11 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { } ctx.Data["gpu_types"] = gpuInfos.GpuInfo + if trainGpuInfos == nil { + json.Unmarshal([]byte(setting.TrainGpuTypes), &trainGpuInfos) + } + ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo + if benchmarkGpuInfos == nil { json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) } @@ -158,6 +164,14 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) } ctx.Data["resource_specs"] = cloudbrain.ResourceSpecs.ResourceSpec + + if cloudbrain.TrainResourceSpecs == nil { + json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs) + } + ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec + ctx.Data["params"] = "" + ctx.Data["branchName"] = ctx.Repo.BranchName + ctx.Data["snn4imagenet_path"] = cloudbrain.Snn4imagenetMountPath ctx.Data["is_snn4imagenet_enabled"] = setting.IsSnn4imagenetEnabled @@ -193,14 +207,14 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { command := cloudbrain.Command if jobType == string(models.JobTypeTrain) { tpl = tplCloudBrainTrainJobNew - command, err := getTrainJobCommand(form) + commandTrain, err := getTrainJobCommand(form) if err != nil { log.Error("getTrainJobCommand failed: %v", err) ctx.RenderWithErr(err.Error(), tpl, &form) return } - log.Info("%s", command) + command = commandTrain } tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, jobType, displayJobName) @@ 
-1431,11 +1445,28 @@ func CloudBrainTrainJobNew(ctx *context.Context) { func getTrainJobCommand(form auth.CreateCloudBrainForm) (string, error) { var command string bootFile := form.BootFile + params := form.Params if !strings.HasSuffix(bootFile, ".py") { log.Error("bootFile(%s) format error", bootFile) return command, errors.New("bootFile format error") } + var parameters models.Parameters + if len(params) != 0 { + err := json.Unmarshal([]byte(params), &parameters) + if err != nil { + log.Error("Failed to Unmarshal params: %s (%v)", params, err) + return command, err + } + + for _, parameter := range parameters.Parameter { + command += "set " + parameter.Label + "=" + parameter.Value + ";" + } + } + + command += "python /code/" + bootFile + log.Info("command:" + command) + return command, nil } diff --git a/templates/repo/cloudbrain/trainjob/new.tmpl b/templates/repo/cloudbrain/trainjob/new.tmpl new file mode 100755 index 0000000000..24c7332634 --- /dev/null +++ b/templates/repo/cloudbrain/trainjob/new.tmpl @@ -0,0 +1,427 @@ +{{template "base/head" .}} + + +
+
+
+
+
+
+
+
+
+
+ {{template "repo/header" .}} +
+ {{template "base/alert" .}} +

+ {{.i18n.Tr "repo.modelarts.train_job.new"}} +

+
+ +
+ {{.CsrfTokenHtml}} + + + +

{{.i18n.Tr "repo.modelarts.train_job.basic_info"}}:

+
+ + + 请输入字母、数字、_和-,最长64个字符,且不能以中划线(-)结尾。 +
+ +
+ + +
+
+ +

{{.i18n.Tr "repo.modelarts.train_job.parameter_setting"}}:

+ + +
+ + +
+ + + +
+ + +
+ +
+ + + + + {{range .images}} + + {{end}} + {{range .public_images}} + + {{end}} + +
+ +
+ + {{if .bootFile}} + + {{else}} + + {{end}} + + + + 查看样例 +
+ +
+ + + 训练脚本存储在/code中,数据集存储在/dataset中,训练输出请存储在/model中以供后续下载。 +
+ +
+ + {{.i18n.Tr "repo.modelarts.train_job.add_run_parameter"}} + +
+ {{if ne 0 (len .params)}} + {{range $k ,$v := .params}} +
+
+ +
+
+ +
+ + + + +
+ {{end}} + {{end}} +
+
+ +
+ + +
+ +
+ + {{.i18n.Tr "repo.cloudbrain.cancel"}} +
+ + + +
+
+
+
+{{template "base/footer" .}} + + \ No newline at end of file -- 2.34.1 From 4968459aeb05d18b944a0977de95a786c15878be Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Fri, 18 Mar 2022 09:17:48 +0800 Subject: [PATCH 05/31] add param --- modules/cloudbrain/cloudbrain.go | 2 +- routers/repo/cloudbrain.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index 8b0786b57b..2a3b52b85c 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -15,7 +15,7 @@ import ( ) const ( - Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;set TEST1=1111;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"` + Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"` //CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"` CommandBenchmark = `echo "start benchmark";cd /benchmark && bash run_bk.sh;echo "end benchmark"` CodeMountPath = "/code" diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index fb7909c8c9..3f2d60dc29 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -1461,7 +1461,7 @@ func getTrainJobCommand(form auth.CreateCloudBrainForm) (string, error) { } for _, parameter := range parameters.Parameter { - command += "set " + parameter.Label + "=" + parameter.Value + ";" + command += "export " + parameter.Label + "=" + parameter.Value + ";" } } -- 2.34.1 From 01ba9c033c102db5966470fe6a0c1e463b0c4e66 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Fri, 18 Mar 2022 10:20:14 +0800 Subject: [PATCH 06/31] 
del&stop --- modules/cloudbrain/cloudbrain.go | 3 +++ routers/repo/cloudbrain.go | 16 ++++++++++++++++ routers/routes/routes.go | 11 +++++------ 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index 2a3b52b85c..b843bb497e 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -156,6 +156,7 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, uuid var resourceSpec *models.ResourceSpec + var versionCount int if jobType == string(models.JobTypeDebug) { if ResourceSpecs == nil { json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) @@ -166,6 +167,7 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, } } } else if jobType == string(models.JobTypeTrain) { + versionCount = 1 if TrainResourceSpecs == nil { json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) } @@ -276,6 +278,7 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, BenchmarkChildTypeID: benchmarkChildTypeID, Description: description, IsLatestVersion: "1", + VersionCount: versionCount, }) if err != nil { diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 3f2d60dc29..21f5a0fceb 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -1470,3 +1470,19 @@ func getTrainJobCommand(form auth.CreateCloudBrainForm) (string, error) { return command, nil } + +func CloudBrainTrainJobDel(ctx *context.Context) { + var listType = ctx.Query("listType") + if err := deleteCloudbrainJob(ctx); err != nil { + log.Error("deleteCloudbrainJob failed: %v", err, ctx.Data["msgID"]) + ctx.ServerError(err.Error(), err) + return + } + + var isAdminPage = ctx.Query("isadminpage") + if ctx.IsUserSiteAdmin() && isAdminPage == "true" { + ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains") + } else { + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + 
"/modelarts/train-job?debugListType=" + listType) + } +} diff --git a/routers/routes/routes.go b/routers/routes/routes.go index 5082448ad0..2a55890e8b 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -1018,18 +1018,17 @@ func RegisterRoutes(m *macaron.Macaron) { }) m.Group("/train-job", func() { - m.Group("/:jobid", func() { + m.Group("/:id", func() { m.Get("", reqRepoCloudBrainReader, repo.TrainJobShow) - m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.TrainJobStop) - m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.TrainJobDel) - m.Get("/model_download", cloudbrain.AdminOrJobCreaterRightForTrain, repo.ModelDownload) + m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainStop) + m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainTrainJobDel) + m.Get("/models", reqRepoCloudBrainReader, repo.CloudBrainShowModels) + m.Get("/download_model", cloudbrain.AdminOrJobCreaterRight, repo.CloudBrainDownloadModel) //m.Get("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, repo.TrainJobNewVersion) //m.Post("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion) }) m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.CloudBrainTrainJobNew) m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.CloudBrainCreate) - - //m.Get("/para-config-list", reqRepoCloudBrainReader, repo.TrainJobGetConfigList) }) }, context.RepoRef()) m.Group("/modelmanage", func() { -- 2.34.1 From f56ea152230ea4835717c9f6108fcbd8f5e4dec6 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Fri, 18 Mar 2022 11:55:48 +0800 Subject: [PATCH 07/31] todo --- routers/routes/routes.go | 5 +++-- templates/repo/modelarts/trainjob/index.tmpl | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git 
a/routers/routes/routes.go b/routers/routes/routes.go index 2a55890e8b..e8b42796b2 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -1018,12 +1018,13 @@ func RegisterRoutes(m *macaron.Macaron) { }) m.Group("/train-job", func() { - m.Group("/:id", func() { + m.Group("/:jobid", func() { + //todo: jobid to id m.Get("", reqRepoCloudBrainReader, repo.TrainJobShow) m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainStop) m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainTrainJobDel) m.Get("/models", reqRepoCloudBrainReader, repo.CloudBrainShowModels) - m.Get("/download_model", cloudbrain.AdminOrJobCreaterRight, repo.CloudBrainDownloadModel) + m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel) //m.Get("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, repo.TrainJobNewVersion) //m.Post("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion) }) diff --git a/templates/repo/modelarts/trainjob/index.tmpl b/templates/repo/modelarts/trainjob/index.tmpl index ed94c0598e..1820465023 100755 --- a/templates/repo/modelarts/trainjob/index.tmpl +++ b/templates/repo/modelarts/trainjob/index.tmpl @@ -143,7 +143,7 @@
{{$.CsrfTokenHtml}} {{if .CanDel}} - + {{$.i18n.Tr "repo.stop"}} {{else}} -- 2.34.1 From 0331f5375daab1694001a9523ab52f90ea8013a0 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Fri, 18 Mar 2022 17:22:32 +0800 Subject: [PATCH 08/31] sync status --- models/cloudbrain.go | 3 +- routers/api/v1/repo/modelarts.go | 64 +- templates/repo/cloudbrain/trainjob/show.tmpl | 872 +++++++++++++++++++ 3 files changed, 923 insertions(+), 16 deletions(-) create mode 100755 templates/repo/cloudbrain/trainjob/show.tmpl diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 96b827994e..16bc68eb11 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -20,9 +20,10 @@ type JobType string type ModelArtsJobStatus string const ( - TypeCloudBrainAll = -1 TypeCloudBrainOne int = iota TypeCloudBrainTwo + + TypeCloudBrainAll = -1 ) const ( diff --git a/routers/api/v1/repo/modelarts.go b/routers/api/v1/repo/modelarts.go index 893f2a32c7..25c28ae200 100755 --- a/routers/api/v1/repo/modelarts.go +++ b/routers/api/v1/repo/modelarts.go @@ -6,6 +6,7 @@ package repo import ( + "code.gitea.io/gitea/modules/cloudbrain" "net/http" "strconv" "strings" @@ -128,26 +129,59 @@ func GetModelArtsTrainJobVersion(ctx *context.APIContext) { ctx.NotFound(err) return } - result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(job.VersionID, 10)) - if err != nil { - ctx.NotFound(err) - return - } - job.Status = modelarts.TransTrainJobStatus(result.IntStatus) - job.Duration = result.Duration - job.TrainJobDuration = result.TrainJobDuration + if job.Type == models.TypeCloudBrainOne { + jobResult, err := cloudbrain.GetJob(job.JobID) + if err != nil { + ctx.NotFound(err) + log.Error("GetJob failed:", err) + return + } + result, err := models.ConvertToJobResultPayload(jobResult.Payload) + if err != nil { + ctx.NotFound(err) + log.Error("ConvertToJobResultPayload failed:", err) + return + } - if result.Duration != 0 { - job.TrainJobDuration = util.AddZero(result.Duration/3600000) + ":" 
+ util.AddZero(result.Duration%3600000/60000) + ":" + util.AddZero(result.Duration%60000/1000) + job.Status = result.JobStatus.State + if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) { + taskRoles := result.TaskRoles + taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{})) + + job.ContainerIp = taskRes.TaskStatuses[0].ContainerIP + job.ContainerID = taskRes.TaskStatuses[0].ContainerID + job.Status = taskRes.TaskStatuses[0].State + } + if result.JobStatus.State != string(models.JobWaiting) { + err = models.UpdateJob(job) + if err != nil { + log.Error("UpdateJob failed:", err) + } + } } else { - job.TrainJobDuration = "00:00:00" - } + result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(job.VersionID, 10)) + if err != nil { + ctx.NotFound(err) + return + } - err = models.UpdateTrainJobVersion(job) - if err != nil { - log.Error("UpdateJob failed:", err) + job.Status = modelarts.TransTrainJobStatus(result.IntStatus) + job.Duration = result.Duration + job.TrainJobDuration = result.TrainJobDuration + + if result.Duration != 0 { + job.TrainJobDuration = util.AddZero(result.Duration/3600000) + ":" + util.AddZero(result.Duration%3600000/60000) + ":" + util.AddZero(result.Duration%60000/1000) + + } else { + job.TrainJobDuration = "00:00:00" + } + + err = models.UpdateTrainJobVersion(job) + if err != nil { + log.Error("UpdateJob failed:", err) + } } ctx.JSON(http.StatusOK, map[string]interface{}{ diff --git a/templates/repo/cloudbrain/trainjob/show.tmpl b/templates/repo/cloudbrain/trainjob/show.tmpl new file mode 100755 index 0000000000..bb6039c97a --- /dev/null +++ b/templates/repo/cloudbrain/trainjob/show.tmpl @@ -0,0 +1,872 @@ +{{template "base/head" .}} + +
+
+
+
+
+
+
+
+
+
+{{template "repo/header" .}} +
+

+ +

+ {{range $k ,$v := .version_list_task}} +
+
+
+
+ + + +
+ {{$.CsrfTokenHtml}} + {{if and (.CanModify) (eq .Status "COMPLETED") ($.Permission.CanWrite $.UnitTypeModelManage) }} + {{$.i18n.Tr "repo.modelarts.create_model"}} + {{else}} + {{$.i18n.Tr "repo.modelarts.create_model"}} + {{end}} + + {{if .CanModify}} + {{$.i18n.Tr "repo.modelarts.modify"}} + {{else}} + {{$.i18n.Tr "repo.modelarts.modify"}} + {{end}} + + {{if .CanDel}} + {{$.i18n.Tr "repo.stop"}} + {{else}} + {{$.i18n.Tr "repo.stop"}} + {{end}} + + + {{if .CanDel}} + {{$.i18n.Tr "repo.delete"}} + {{else}} + {{$.i18n.Tr "repo.delete"}} + {{end}} +
+
+ + {{TimeSinceUnix1 .Cloudbrain.CreatedUnix}} + {{$.i18n.Tr "repo.modelarts.current_version"}}:{{.VersionName}} + {{$.i18n.Tr "repo.modelarts.parent_version"}}:{{.PreVersionName}} + {{$.i18n.Tr "repo.modelarts.status"}}: + {{.Status}} + + {{$.i18n.Tr "repo.modelarts.train_job.dura_time"}}: + {{.TrainJobDuration}} + + +
+
+
+
+
+
+
+
+ +
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ {{$.i18n.Tr "repo.cloudbrain_task"}} + +
+ {{.DisplayJobName}} +
+
+ {{$.i18n.Tr "repo.modelarts.status"}} + +
+ {{.Status}} +
+
+ {{$.i18n.Tr "repo.modelarts.run_version"}} + +
+ {{.VersionName}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.start_time"}} + +
+ {{TimeSinceUnix1 .Cloudbrain.CreatedUnix}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.dura_time"}} + +
+ {{.TrainJobDuration}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.standard"}} + +
+ {{.FlavorName}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.compute_node"}} + +
+ {{.WorkServerNumber}} +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ {{$.i18n.Tr "repo.modelarts.train_job.AI_driver"}} + +
+ {{.EngineName}} +
+
+ {{$.i18n.Tr "repo.modelarts.code_version"}} + +
+ {{.BranchName}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.start_file"}} + +
+ {{.BootFile}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.train_dataset"}} + +
+ {{.DatasetName}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.run_parameter"}} + +
+ {{.Parameters}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.description"}} + +
+ {{.Cloudbrain.Description}} +
+
+
+
+
+ +
+
+
+
+ +
+ + +

+                            
+ +
+ +
+
+ + + +
+ +
+
+ +
+
+
+ {{end}} {{template "base/paginate" .}} +
+ +
+ +
+ +
+ +
+
+{{template "base/footer" .}} + + \ No newline at end of file -- 2.34.1 From 2ead5ccbb69da4b811596909cce3036f1b1caa83 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Fri, 18 Mar 2022 18:55:51 +0800 Subject: [PATCH 09/31] opt --- modules/cloudbrain/cloudbrain.go | 18 +- routers/repo/cloudbrain.go | 24 +- routers/routes/routes.go | 2 +- templates/repo/cloudbrain/trainjob/show.tmpl | 539 ++----------------- 4 files changed, 67 insertions(+), 516 deletions(-) diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index b843bb497e..55044e7046 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -157,21 +157,21 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, var resourceSpec *models.ResourceSpec var versionCount int - if jobType == string(models.JobTypeDebug) { - if ResourceSpecs == nil { - json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) + if jobType == string(models.JobTypeTrain) { + versionCount = 1 + if TrainResourceSpecs == nil { + json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) } - for _, spec := range ResourceSpecs.ResourceSpec { + for _, spec := range TrainResourceSpecs.ResourceSpec { if resourceSpecId == spec.Id { resourceSpec = spec } } - } else if jobType == string(models.JobTypeTrain) { - versionCount = 1 - if TrainResourceSpecs == nil { - json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) + } else { + if ResourceSpecs == nil { + json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) } - for _, spec := range TrainResourceSpecs.ResourceSpec { + for _, spec := range ResourceSpecs.ResourceSpec { if resourceSpecId == spec.Id { resourceSpec = spec } diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 21f5a0fceb..4aa186a914 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -36,7 +36,8 @@ const ( tplCloudBrainBenchmarkNew base.TplName = 
"repo/cloudbrain/benchmark/new" tplCloudBrainBenchmarkShow base.TplName = "repo/cloudbrain/benchmark/show" - tplCloudBrainTrainJobNew base.TplName = "repo/cloudbrain/trainjob/new" + tplCloudBrainTrainJobNew base.TplName = "repo/cloudbrain/trainjob/new" + tplCloudBrainTrainJobShow base.TplName = "repo/cloudbrain/trainjob/show" ) var ( @@ -383,18 +384,29 @@ func CloudBrainBenchMarkShow(ctx *context.Context) { return } } - cloudBrainShow(ctx, tplCloudBrainBenchmarkShow) + cloudBrainShow(ctx, tplCloudBrainBenchmarkShow, models.JobTypeBenchmark) } func CloudBrainShow(ctx *context.Context) { - cloudBrainShow(ctx, tplCloudBrainShow) + cloudBrainShow(ctx, tplCloudBrainShow, models.JobTypeDebug) } -func cloudBrainShow(ctx *context.Context, tpName base.TplName) { +func CloudBrainTrainJobShow(ctx *context.Context) { + cloudBrainShow(ctx, tplCloudBrainTrainJobShow, models.JobTypeTrain) +} + +func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.JobType) { ctx.Data["PageIsCloudBrain"] = true - var ID = ctx.Params(":id") debugListType := ctx.Query("debugListType") - task, err := models.GetCloudbrainByID(ID) + + var task *models.Cloudbrain + var err error + if jobType == models.JobTypeTrain { + task, err = models.GetCloudbrainByJobID(ctx.Params(":jobid")) + } else { + task, err = models.GetCloudbrainByID(ctx.Params(":id")) + } + if err != nil { log.Info("error:" + err.Error()) ctx.Data["error"] = err.Error() diff --git a/routers/routes/routes.go b/routers/routes/routes.go index e8b42796b2..6ce28600ba 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -1020,7 +1020,7 @@ func RegisterRoutes(m *macaron.Macaron) { m.Group("/train-job", func() { m.Group("/:jobid", func() { //todo: jobid to id - m.Get("", reqRepoCloudBrainReader, repo.TrainJobShow) + m.Get("", reqRepoCloudBrainReader, repo.CloudBrainTrainJobShow) m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainStop) m.Post("/del", 
cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainTrainJobDel) m.Get("/models", reqRepoCloudBrainReader, repo.CloudBrainShowModels) diff --git a/templates/repo/cloudbrain/trainjob/show.tmpl b/templates/repo/cloudbrain/trainjob/show.tmpl index bb6039c97a..10a3d544b4 100755 --- a/templates/repo/cloudbrain/trainjob/show.tmpl +++ b/templates/repo/cloudbrain/trainjob/show.tmpl @@ -179,8 +179,8 @@ td, th { {{.i18n.Tr "repo.cloudbrain"}}
/
- - {{$.i18n.Tr "repo.modelarts.train_job"}} + + {{$.i18n.Tr "repo.modelarts.evaluate_job"}}
/
{{.displayJobName}}
@@ -188,51 +188,22 @@ td, th { {{range $k ,$v := .version_list_task}}
+
-
- {{$.CsrfTokenHtml}} - {{if and (.CanModify) (eq .Status "COMPLETED") ($.Permission.CanWrite $.UnitTypeModelManage) }} - {{$.i18n.Tr "repo.modelarts.create_model"}} - {{else}} - {{$.i18n.Tr "repo.modelarts.create_model"}} - {{end}} - - {{if .CanModify}} - {{$.i18n.Tr "repo.modelarts.modify"}} - {{else}} - {{$.i18n.Tr "repo.modelarts.modify"}} - {{end}} - - {{if .CanDel}} - {{$.i18n.Tr "repo.stop"}} - {{else}} - {{$.i18n.Tr "repo.stop"}} - {{end}} - - - {{if .CanDel}} - {{$.i18n.Tr "repo.delete"}} - {{else}} - {{$.i18n.Tr "repo.delete"}} - {{end}} -
- - {{TimeSinceUnix1 .Cloudbrain.CreatedUnix}} - {{$.i18n.Tr "repo.modelarts.current_version"}}:{{.VersionName}} - {{$.i18n.Tr "repo.modelarts.parent_version"}}:{{.PreVersionName}} + {{TimeSinceUnix1 .CreatedUnix}} + {{$.i18n.Tr "repo.modelarts.status"}}: {{.Status}} {{$.i18n.Tr "repo.modelarts.train_job.dura_time"}}: - {{.TrainJobDuration}} - - + {{$.duration}} +
@@ -242,10 +213,8 @@ td, th {
@@ -275,17 +244,7 @@ td, th {
- - - {{$.i18n.Tr "repo.modelarts.run_version"}} - - - -
- {{.VersionName}} -
- - + {{$.i18n.Tr "repo.modelarts.train_job.start_time"}} @@ -293,7 +252,7 @@ td, th {
- {{TimeSinceUnix1 .Cloudbrain.CreatedUnix}} + {{TimeSinceUnix1 .CreatedUnix}}
@@ -304,100 +263,61 @@ td, th {
- {{.TrainJobDuration}} + {{$.duration}}
- {{$.i18n.Tr "repo.modelarts.train_job.standard"}} + 镜像 -
- {{.FlavorName}} +
+ {{.Image}}
- - - - {{$.i18n.Tr "repo.modelarts.train_job.compute_node"}} - - -
- {{.WorkServerNumber}} -
- - + +
+ - - - - - + + - - - - - - - - - - - + - + @@ -414,7 +334,7 @@ td, th { -
+

@@ -423,19 +343,7 @@ td, th {
                         
-
- - - -
- -
-
- + @@ -461,68 +369,14 @@ td, th { - -
- -
+ + {{template "base/footer" .}} \ No newline at end of file -- 2.34.1 From 8c20bb83ca0d9b8b357ee2bb7a10cedbc9a1ce9e Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Mon, 21 Mar 2022 15:29:55 +0800 Subject: [PATCH 10/31] show --- modules/cloudbrain/cloudbrain.go | 3 ++- routers/repo/cloudbrain.go | 4 ++-- templates/repo/cloudbrain/trainjob/show.tmpl | 12 ++++++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index 55044e7046..2a42d1a97a 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -148,7 +148,7 @@ func AdminOrJobCreaterRightForTrain(ctx *context.Context) { } -func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, uuid, codePath, modelPath, benchmarkPath, snn4imagenetPath, brainScorePath, jobType, gpuQueue, description string, benchmarkTypeID, benchmarkChildTypeID, resourceSpecId int) error { +func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, uuid, codePath, modelPath, benchmarkPath, snn4imagenetPath, brainScorePath, jobType, gpuQueue, description, branchName string, benchmarkTypeID, benchmarkChildTypeID, resourceSpecId int) error { dataActualPath := setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.Attachment.Minio.BasePath + @@ -279,6 +279,7 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, Description: description, IsLatestVersion: "1", VersionCount: versionCount, + BranchName: branchName, }) if err != nil { diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 4aa186a914..ba79eb6585 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -299,7 +299,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { err = cloudbrain.GenerateTask(ctx, displayJobName, jobName, image, command, uuid, storage.GetMinioPath(jobName, 
cloudbrain.CodeMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), - storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), jobType, gpuQueue, form.Description, + storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), jobType, gpuQueue, form.Description, branchName, 0, 0, resourceSpecId) if err != nil { cloudBrainNewDataPrepare(ctx) @@ -1419,7 +1419,7 @@ func CloudBrainBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainF err = cloudbrain.GenerateTask(ctx, displayJobName, jobName, image, command, childInfo.Attachment, storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), - storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), string(models.JobTypeBenchmark), gpuQueue, form.Description, + storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), string(models.JobTypeBenchmark), gpuQueue, form.Description, cloudbrain.DefaultBranchName, benchmarkTypeID, benchmarkChildTypeID, resourceSpecId) if err != nil { cloudBrainNewDataPrepare(ctx) diff --git a/templates/repo/cloudbrain/trainjob/show.tmpl b/templates/repo/cloudbrain/trainjob/show.tmpl index 10a3d544b4..660df28fb7 100755 --- a/templates/repo/cloudbrain/trainjob/show.tmpl +++ b/templates/repo/cloudbrain/trainjob/show.tmpl @@ -298,6 +298,18 @@ td, th { + + + + + + + - +
- {{$.i18n.Tr "repo.modelarts.train_job.AI_driver"}} - -
- {{.EngineName}} -
-
- {{$.i18n.Tr "repo.modelarts.code_version"}} + {{$.i18n.Tr "repo.modelarts.train_job.description"}} -
- {{.BranchName}} +
+ {{.Description}}
- {{$.i18n.Tr "repo.modelarts.train_job.start_file"}} + {{$.i18n.Tr "repo.modelarts.train_job.standard"}}
- {{.BootFile}} -
-
- {{$.i18n.Tr "repo.modelarts.train_job.train_dataset"}} - -
- {{.DatasetName}} + {{$.resource_spec}}
- {{$.i18n.Tr "repo.modelarts.train_job.run_parameter"}} - -
- {{.Parameters}} -
-
- {{$.i18n.Tr "repo.modelarts.train_job.description"}} + 创建者 -
- {{.Cloudbrain.Description}} +
+ {{.User.Name}}
+ {{$.i18n.Tr "repo.modelarts.code_version"}} + +
+ {{.BranchName}} +
+
{{$.i18n.Tr "repo.modelarts.train_job.standard"}} -- 2.34.1 From 82944c38ac01caa8268cb94bad09798db70a4a46 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Mon, 21 Mar 2022 17:38:19 +0800 Subject: [PATCH 11/31] show model --- routers/api/v1/api.go | 5 + routers/api/v1/repo/cloudbrain.go | 59 ++++++++- routers/api/v1/repo/modelarts.go | 5 +- templates/repo/cloudbrain/trainjob/show.tmpl | 129 +++++++++++++++++++ 4 files changed, 193 insertions(+), 5 deletions(-) diff --git a/routers/api/v1/api.go b/routers/api/v1/api.go index 306854af37..b0ff7e9b80 100755 --- a/routers/api/v1/api.go +++ b/routers/api/v1/api.go @@ -882,6 +882,11 @@ func RegisterRoutes(m *macaron.Macaron) { m.Group("/cloudbrain", func() { m.Get("/:id", repo.GetCloudbrainTask) m.Get("/:id/log", repo.CloudbrainGetLog) + m.Group("/train-job", func() { + m.Group("/:jobid", func() { + m.Get("/model_list", repo.CloudBrainModelList) + }) + }) }, reqRepoReader(models.UnitTypeCloudBrain)) m.Group("/modelarts", func() { m.Group("/notebook", func() { diff --git a/routers/api/v1/repo/cloudbrain.go b/routers/api/v1/repo/cloudbrain.go index f92259c3da..e0e229606d 100755 --- a/routers/api/v1/repo/cloudbrain.go +++ b/routers/api/v1/repo/cloudbrain.go @@ -6,15 +6,18 @@ package repo import ( + "encoding/json" "net/http" "sort" + "strings" "time" - "code.gitea.io/gitea/modules/log" - "code.gitea.io/gitea/models" "code.gitea.io/gitea/modules/cloudbrain" "code.gitea.io/gitea/modules/context" + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/storage" + routerRepo "code.gitea.io/gitea/routers/repo" ) // cloudbrain get job task by jobid @@ -152,3 +155,55 @@ func CloudbrainGetLog(ctx *context.Context) { return } + +func CloudBrainModelList(ctx *context.APIContext) { + var ( + err error + ) + + var jobID = ctx.Params(":jobid") + var versionName = ctx.Query("version_name") + parentDir := ctx.Query("parentDir") + dirArray := strings.Split(parentDir, "/") + + task, err := 
models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) + if err != nil { + log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error()) + return + } + + //get dirs + dirs, err := routerRepo.GetModelDirs(task.JobName, parentDir) + if err != nil { + log.Error("GetModelDirs failed:%v", err.Error(), ctx.Data["msgID"]) + ctx.ServerError("GetModelDirs failed:", err) + return + } + + var fileInfos []storage.FileInfo + err = json.Unmarshal([]byte(dirs), &fileInfos) + if err != nil { + log.Error("json.Unmarshal failed:%v", err.Error(), ctx.Data["msgID"]) + ctx.ServerError("json.Unmarshal failed:", err) + return + } + + for i, fileInfo := range fileInfos { + temp, _ := time.Parse("2006-01-02 15:04:05", fileInfo.ModTime) + fileInfos[i].ModTime = temp.Local().Format("2006-01-02 15:04:05") + } + + sort.Slice(fileInfos, func(i, j int) bool { + return fileInfos[i].ModTime > fileInfos[j].ModTime + }) + + ctx.JSON(http.StatusOK, map[string]interface{}{ + "JobID": jobID, + "VersionName": versionName, + "StatusOK": 0, + "Path": dirArray, + "Dirs": fileInfos, + "task": task, + "PageIsCloudBrain": true, + }) +} diff --git a/routers/api/v1/repo/modelarts.go b/routers/api/v1/repo/modelarts.go index 25c28ae200..0f6f748277 100755 --- a/routers/api/v1/repo/modelarts.go +++ b/routers/api/v1/repo/modelarts.go @@ -6,18 +6,17 @@ package repo import ( - "code.gitea.io/gitea/modules/cloudbrain" "net/http" "strconv" "strings" - "code.gitea.io/gitea/modules/util" - "code.gitea.io/gitea/models" + "code.gitea.io/gitea/modules/cloudbrain" "code.gitea.io/gitea/modules/context" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/modelarts" "code.gitea.io/gitea/modules/storage" + "code.gitea.io/gitea/modules/util" routerRepo "code.gitea.io/gitea/routers/repo" ) diff --git a/templates/repo/cloudbrain/trainjob/show.tmpl b/templates/repo/cloudbrain/trainjob/show.tmpl index 660df28fb7..b6a0eadc79 100755 --- a/templates/repo/cloudbrain/trainjob/show.tmpl +++ 
b/templates/repo/cloudbrain/trainjob/show.tmpl @@ -215,6 +215,7 @@ td, th {
@@ -355,6 +356,19 @@ td, th {
+ +
+ + + +
+ +
+
@@ -419,5 +433,120 @@ td, th { document.getElementById("mask").style.display = "none" }); } + function loadModelFile(version_name,parents,filename,init){ + parents = parents || '' + filename = filename || '' + init = init || '' + $.get(`/api/v1/repos/${userName}/${repoPath}/cloudbrain/train-job/${jobID}/model_list?version_name=${version_name}&parentDir=${parents}`, (data) => { + $(`#dir_list${version_name}`).empty() + renderDir(data,version_name) + if(init==="init"){ + $(`input[name=model${version_name}]`).val("") + $(`input[name=modelback${version_name}]`).val(version_name) + $(`#file_breadcrumb${version_name}`).empty() + let htmlBread = "" + htmlBread += `
${version_name}
` + htmlBread += "
/
" + $(`#file_breadcrumb${version_name}`).append(htmlBread) + }else{ + renderBrend(version_name,parents,filename,init) + } + }).fail(function(err) { + console.log(err,version_name); + }); + + } + function renderBrend(version_name,parents,filename,init){ + if(init=="folder"){ + let htmlBrend = "" + let sectionName=$(`#file_breadcrumb${version_name} .active.section`).text() + let parents1 = $(`input[name=model${version_name}]`).val() + let filename1 = $(`input[name=modelback${version_name}]`).val() + if(parents1===""){ + $(`#file_breadcrumb${version_name} .active.section`).replaceWith(`${sectionName}`) + }else{ + $(`#file_breadcrumb${version_name} .active.section`).replaceWith(`${sectionName}`) + } + + htmlBrend += `
${filename}
` + htmlBrend += "
/
" + $(`#file_breadcrumb${version_name}`).append(htmlBrend) + $(`input[name=model${version_name}]`).val(parents) + $(`input[name=modelback${version_name}]`).val(filename) + }else{ + $(`input[name=model${version_name}]`).val(parents) + $(`input[name=modelback${version_name}]`).val(filename) + $(`#file_breadcrumb${version_name} a.section:contains(${filename})`).nextAll().remove() + $(`#file_breadcrumb${version_name} a.section:contains(${filename})`).replaceWith(`
${filename}
`) + $(`#file_breadcrumb${version_name} div.section:contains(${filename})`).append("
/
") + } + + } + function renderDir(data,version_name){ + let html="" + html += "
" + html += "
" + html += "
" + html += "
" + html += "" + html += '' + // html += "" + for(let i=0;i` + html += "" + data.Dirs[i].FileName + "" + }else{ + if(downlaodFlag){ + html += `` + } + else{ + html += `` + } + html += "" + data.Dirs[i].FileName + "" + } + html += '' + html += "" + html += "" + html += "" + + html += "" + html += "" + + } + html += "" + html += "
" + if(data.Dirs[i].IsDir){ + html += "" + }else{ + html += ""+ `${dirs_size}` + "" + } + + html += "" + html += "" + data.Dirs[i].ModTime + "" + html += "
" + html += "
" + html += "
" + html += "
" + html += "
" + $(`#dir_list${version_name}`).append(html) + } + function renderSize(value){ + if(null==value||value==''){ + return "0 Bytes"; + } + var unitArr = new Array("Bytes","KB","MB","GB","TB","PB","EB","ZB","YB"); + var index=0; + var srcsize = parseFloat(value); + index=Math.floor(Math.log(srcsize)/Math.log(1024)); + var size =srcsize/Math.pow(1024,index); + size=size.toFixed(0);//保留的小数位数 + return size+unitArr[index]; + } \ No newline at end of file -- 2.34.1 From 9e4a3de1e406c23e30875e3d58e486da100f9866 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Mon, 21 Mar 2022 19:16:47 +0800 Subject: [PATCH 12/31] show model --- routers/routes/routes.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/routers/routes/routes.go b/routers/routes/routes.go index 6ce28600ba..82081ad36a 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -1019,11 +1019,10 @@ func RegisterRoutes(m *macaron.Macaron) { m.Group("/train-job", func() { m.Group("/:jobid", func() { - //todo: jobid to id m.Get("", reqRepoCloudBrainReader, repo.CloudBrainTrainJobShow) m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainStop) m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainTrainJobDel) - m.Get("/models", reqRepoCloudBrainReader, repo.CloudBrainShowModels) + //m.Get("/models", reqRepoCloudBrainReader, repo.CloudBrainShowModels) m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel) //m.Get("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, repo.TrainJobNewVersion) //m.Post("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion) -- 2.34.1 From 91e4910a6f2e6d658bd05faeb2fc649edd11ac34 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Tue, 22 Mar 2022 17:36:10 +0800 Subject: [PATCH 13/31] show job --- 
routers/repo/cloudbrain.go | 1 + templates/repo/cloudbrain/trainjob/show.tmpl | 71 +++++++++++++++++++- 2 files changed, 69 insertions(+), 3 deletions(-) diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index ba79eb6585..f17ac40262 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -483,6 +483,7 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo version_list_task = append(version_list_task, task) ctx.Data["version_list_task"] = version_list_task ctx.Data["debugListType"] = debugListType + ctx.Data["canDownload"] = cloudbrain.CanDeleteJob(ctx, task) ctx.HTML(200, tpName) } diff --git a/templates/repo/cloudbrain/trainjob/show.tmpl b/templates/repo/cloudbrain/trainjob/show.tmpl index b6a0eadc79..aeb0c6b5e6 100755 --- a/templates/repo/cloudbrain/trainjob/show.tmpl +++ b/templates/repo/cloudbrain/trainjob/show.tmpl @@ -179,8 +179,8 @@ td, th { {{.i18n.Tr "repo.cloudbrain"}}
/
- - {{$.i18n.Tr "repo.modelarts.evaluate_job"}} + + {{$.i18n.Tr "repo.modelarts.train_job"}}
/
{{.displayJobName}}
@@ -413,6 +413,8 @@ td, th { let userName let repoPath let jobName + let downlaodFlag = {{$.canDownload}} + let taskID = {{$.task.ID}} $(document).ready(function(){ let url = window.location.href; let urlArr = url.split('/') @@ -420,10 +422,20 @@ td, th { repoPath = urlArr.slice(-4)[0] jobName = urlArr.slice(-1)[0] }) + function stopBubbling(e) { + e = window.event || e; + if (e.stopPropagation) { + e.stopPropagation(); //阻止事件 冒泡传播 + } else { + e.cancelBubble = true; //ie兼容 + } + } + let timeid = window.setInterval(loadJobStatus, 30000); + $(document).ready(loadJobStatus); function loadLog(version_name){ document.getElementById("mask").style.display = "block" - $.get(`/api/v1/repos/${userName}/${repoPath}/cloudbrain/${jobName}/log?version_name=${version_name}&lines=50&order=asc`, (data) => { + $.get(`/api/v1/repos/${userName}/${repoPath}/cloudbrain/${taskID}/log?version_name=${version_name}&lines=50&order=asc`, (data) => { $('input[name=end_line]').val(data.EndLine) $('input[name=start_line]').val(data.StartLine) $(`#log_file${version_name}`).text(data.Content) @@ -548,5 +560,58 @@ td, th { size=size.toFixed(0);//保留的小数位数 return size+unitArr[index]; } + function loadJobStatus() { + $(".ui.accordion.border-according").each((index, job) => { + const jobID = job.dataset.jobid; + const repoPath = job.dataset.repopath; + const versionname = job.dataset.version + // ['IMAGE_FAILED','SUBMIT_FAILED','DELETE_FAILED','KILLED','COMPLETED','FAILED','CANCELED','LOST','START_FAILED'] + // if (job.textContent.trim() == 'IMAGE_FAILED' || job.textContent.trim() == 'SUBMIT_FAILED' || job.textContent.trim() == 'DELETE_FAILED' + // || job.textContent.trim() == 'KILLED' || job.textContent.trim() == 'COMPLETED' || job.textContent.trim() == 'FAILED' + // || job.textContent.trim() == 'CANCELED' || job.textContent.trim() == 'LOST') { + // return + // } + let status = $(`#${versionname}-status-span`).text() + + 
if(['IMAGE_FAILED','SUBMIT_FAILED','DELETE_FAILED','KILLED','COMPLETED','FAILED','CANCELED','LOST','START_FAILED','SUCCEEDED'].includes(status)){ + return + } + let stopArray=["KILLED","FAILED","START_FAILED","KILLING","COMPLETED","SUCCEEDED"] + $.get(`/api/v1/repos/${repoPath}/cloudbrain/${taskID}?version_name=${versionname}`, (data) => { + //$(`#${versionname}-duration-span`).text(data.JobDuration) + $(`#${versionname}-status-span span`).text(data.JobStatus) + $(`#${versionname}-status-span i`).attr("class",data.JobStatus) + // detail status and duration + //$('#'+versionname+'-duration').text(data.JobDuration) + $('#'+versionname+'-status').text(data.JobStatus) + if(stopArray.includes(data.JobStatus)){ + $('#'+versionname+'-stop').addClass('disabled') + } + if(data.JobStatus==="COMPLETED"){ + $('#'+versionname+'-create-model').removeClass('disabled').addClass('blue') + } + }).fail(function(err) { + console.log(err); + }); + }); + }; + /* refreshStatus(version_name): re-fetches the task status for the given version, updates the header and detail status elements, reloads the log, and stops the triggering event from bubbling. */ + function refreshStatus(version_name){ + $.get(`/api/v1/repos/${userName}/${repoPath}/cloudbrain/${taskID}?version_name=${version_name}`,(data)=>{ + // header status and duration + //$(`#${version_name}-duration-span`).text(data.JobDuration) + $(`#${version_name}-status-span span`).text(data.JobStatus) + $(`#${version_name}-status-span i`).attr("class",data.JobStatus) + // detail status and duration + //$('#'+version_name+'-duration').text(data.JobDuration) + $('#'+version_name+'-status').text(data.JobStatus) + loadLog(version_name) + + + }).fail(function(err) { + console.log(err); + }); + stopBubbling(arguments.callee.caller.arguments[0]) + } \ No newline at end of file -- 2.34.1 From efdbc20ed18e9a3f7cfff885281182f0a2faf841 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Wed, 23 Mar 2022 11:48:35 +0800 Subject: [PATCH 14/31] download model --- routers/repo/cloudbrain.go | 1 - templates/repo/cloudbrain/trainjob/show.tmpl | 40 ++++++++++---------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git 
a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 28ea5479ca..ce4889042b 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -1452,7 +1452,6 @@ func getTrainJobCommand(form auth.CreateCloudBrainForm) (string, error) { } command += "python /code/" + bootFile - log.Info("command:" + command) return command, nil } diff --git a/templates/repo/cloudbrain/trainjob/show.tmpl b/templates/repo/cloudbrain/trainjob/show.tmpl index aeb0c6b5e6..040ad612dc 100755 --- a/templates/repo/cloudbrain/trainjob/show.tmpl +++ b/templates/repo/cloudbrain/trainjob/show.tmpl @@ -412,15 +412,16 @@ td, th { let userName let repoPath - let jobName + let jobID let downlaodFlag = {{$.canDownload}} let taskID = {{$.task.ID}} + let realJobName = {{$.task.JobName}} $(document).ready(function(){ let url = window.location.href; let urlArr = url.split('/') userName = urlArr.slice(-5)[0] repoPath = urlArr.slice(-4)[0] - jobName = urlArr.slice(-1)[0] + jobID = urlArr.slice(-1)[0] }) function stopBubbling(e) { e = window.event || e; @@ -449,23 +450,24 @@ td, th { parents = parents || '' filename = filename || '' init = init || '' + console.log("start") $.get(`/api/v1/repos/${userName}/${repoPath}/cloudbrain/train-job/${jobID}/model_list?version_name=${version_name}&parentDir=${parents}`, (data) => { - $(`#dir_list${version_name}`).empty() - renderDir(data,version_name) - if(init==="init"){ - $(`input[name=model${version_name}]`).val("") - $(`input[name=modelback${version_name}]`).val(version_name) - $(`#file_breadcrumb${version_name}`).empty() - let htmlBread = "" - htmlBread += `
${version_name}
` - htmlBread += "
/
" - $(`#file_breadcrumb${version_name}`).append(htmlBread) - }else{ - renderBrend(version_name,parents,filename,init) - } - }).fail(function(err) { - console.log(err,version_name); - }); + $(`#dir_list${version_name}`).empty() + renderDir(data,version_name) + if(init==="init"){ + $(`input[name=model${version_name}]`).val("") + $(`input[name=modelback${version_name}]`).val(version_name) + $(`#file_breadcrumb${version_name}`).empty() + let htmlBread = "" + htmlBread += `
${version_name}
` + htmlBread += "
/
" + $(`#file_breadcrumb${version_name}`).append(htmlBread) + }else{ + renderBrend(version_name,parents,filename,init) + } + }).fail(function(err) { + console.log(err,version_name); + }); } function renderBrend(version_name,parents,filename,init){ @@ -515,7 +517,7 @@ td, th { html += "" + data.Dirs[i].FileName + "" }else{ if(downlaodFlag){ - html += `` + html += `` } else{ html += `` -- 2.34.1 From ebdde739c01aa151e93f6bba1deda7b81df8fc5e Mon Sep 17 00:00:00 2001 From: zhoupzh Date: Wed, 23 Mar 2022 16:24:15 +0800 Subject: [PATCH 15/31] fix issue --- templates/repo/modelarts/trainjob/new.tmpl | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/templates/repo/modelarts/trainjob/new.tmpl b/templates/repo/modelarts/trainjob/new.tmpl index afa09cf296..a513db981e 100755 --- a/templates/repo/modelarts/trainjob/new.tmpl +++ b/templates/repo/modelarts/trainjob/new.tmpl @@ -78,6 +78,24 @@

{{.i18n.Tr "repo.modelarts.train_job.basic_info"}}:

+
-- 2.34.1 From a298a16664fe6c70c66ee080078dc0cf72fcc4dc Mon Sep 17 00:00:00 2001 From: zhoupzh Date: Wed, 23 Mar 2022 17:44:17 +0800 Subject: [PATCH 16/31] fix issue --- templates/repo/cloudbrain/trainjob/new.tmpl | 46 +++++++++++++++----- templates/repo/modelarts/trainjob/index.tmpl | 4 +- templates/repo/modelarts/trainjob/new.tmpl | 4 +- 3 files changed, 38 insertions(+), 16 deletions(-) diff --git a/templates/repo/cloudbrain/trainjob/new.tmpl b/templates/repo/cloudbrain/trainjob/new.tmpl index 24c7332634..d1faebc98e 100755 --- a/templates/repo/cloudbrain/trainjob/new.tmpl +++ b/templates/repo/cloudbrain/trainjob/new.tmpl @@ -23,12 +23,16 @@ width: 80.7% !important; margin-left: 10px; } +.width806{ + width: 80.6% !important; + margin-left: -2px; +} .width85{ width: 85% !important; margin-left: 4.5rem !important; } .width81{ - margin-left: 1.5rem; + margin-left: 1.5rem !important; width: 81% !important; } @@ -78,10 +82,28 @@

{{.i18n.Tr "repo.modelarts.train_job.basic_info"}}:

+
+ + +
- 请输入字母、数字、_和-,最长64个字符,且不能以中划线(-)结尾。 + {{.i18n.Tr "cloudbrain.job_name_rule"}}
@@ -123,18 +145,18 @@
- {{range .train_gpu_types}} {{end}}
-
- - +
+ + - + {{range .images}} {{end}} @@ -157,9 +179,9 @@ 查看样例
-
- - {{range .attachments}} @@ -192,8 +214,8 @@
-
- +
+
- 镜像 + {{$.i18n.Tr "repo.modelarts.train_job.standard"}} -
- {{.Image}} +
+ {{$.resource_spec}}
@@ -286,19 +287,18 @@ td, th {
- - + - - + + - + - + + + + + + + + + + + + + -- 2.34.1 From bed5a6afc8eb5d1c56f0980a1e20e24a41611f14 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Fri, 25 Mar 2022 09:55:59 +0800 Subject: [PATCH 24/31] show resource type --- options/locale/locale_en-US.ini | 3 ++- routers/repo/cloudbrain.go | 1 + templates/repo/cloudbrain/trainjob/show.tmpl | 12 +++++++++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/options/locale/locale_en-US.ini b/options/locale/locale_en-US.ini index 3ebe5a9b5a..93b34963f2 100755 --- a/options/locale/locale_en-US.ini +++ b/options/locale/locale_en-US.ini @@ -967,7 +967,8 @@ modelarts.train_job.parameter_value=Parameter Value modelarts.train_job.resource_setting=resource_setting modelarts.train_job.resource_setting_info=resource_setting_info modelarts.train_job.resource_pool=resource_pool -modelarts.train_job.resource_type=resource_type +modelarts.train_job.resource_type=Resource Type +modelarts.train_job.train_dataset=Train Dataset modelarts.train_job.standard=Standard modelarts.train_job.NAS_address=NAS Address modelarts.train_job.NAS_mount_path=NAS Mount Path diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index d575dacac3..a5e4179114 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -415,6 +415,7 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo jobRes.Resource.Memory = strings.ReplaceAll(jobRes.Resource.Memory, "Mi", "MB") spec := "GPU数:" + strconv.Itoa(jobRes.Resource.NvidiaComGpu) + ",CPU数:" + strconv.Itoa(jobRes.Resource.CPU) + ",内存(MB):" + jobRes.Resource.Memory ctx.Data["resource_spec"] = spec + ctx.Data["resource_type"] = jobRes.Config.GpuType taskRoles := jobRes.TaskRoles if jobRes.JobStatus.State != string(models.JobFailed) { diff --git a/templates/repo/cloudbrain/trainjob/show.tmpl b/templates/repo/cloudbrain/trainjob/show.tmpl index da8a0a65e9..18518f0152 100755 --- a/templates/repo/cloudbrain/trainjob/show.tmpl 
+++ b/templates/repo/cloudbrain/trainjob/show.tmpl @@ -268,7 +268,17 @@ td, th { - + + + + +
- {{$.i18n.Tr "repo.modelarts.train_job.description"}} + 镜像 -
- {{.Description}} +
+ {{.Image}}
{{$.i18n.Tr "repo.modelarts.code_version"}} @@ -313,24 +313,48 @@ td, th {
- {{$.i18n.Tr "repo.modelarts.train_job.standard"}} + {{$.i18n.Tr "repo.modelarts.train_job.start_file"}}
- {{$.resource_spec}} + {{.BootFile}}
- 创建者 + {{$.i18n.Tr "repo.modelarts.train_job.train_dataset"}} -
- {{.User.Name}} +
+ {{.DatasetName}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.run_parameter"}} + +
+ {{.Parameters}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.description"}} + +
+ {{.Description}}
+ {{$.i18n.Tr "repo.modelarts.train_job.resource_type"}} + +
+ {{$.resource_type}} +
+
{{$.i18n.Tr "repo.modelarts.train_job.standard"}} -- 2.34.1 From fa929aead8867c6456b60dc089be9b8e17e218fe Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Fri, 25 Mar 2022 17:24:33 +0800 Subject: [PATCH 25/31] fix bug --- modules/cloudbrain/cloudbrain.go | 2 +- options/locale/locale_en-US.ini | 3 ++- options/locale/locale_zh-CN.ini | 2 +- public/home/home.js | 2 ++ routers/repo/modelarts.go | 10 ++++++++-- templates/repo/cloudbrain/trainjob/new.tmpl | 6 +++--- templates/repo/modelarts/trainjob/index.tmpl | 1 + templates/user/dashboard/feeds.tmpl | 4 +++- 8 files changed, 21 insertions(+), 9 deletions(-) diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index 36c83e7fb6..1b0bef64ad 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -306,7 +306,7 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, if string(models.JobTypeBenchmark) == jobType { notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateBenchMarkTask) } else if string(models.JobTypeTrain) == jobType { - notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateGPUTrainTask) + notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, displayJobName, models.ActionCreateGPUTrainTask) } else { notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugGPUTask) } diff --git a/options/locale/locale_en-US.ini b/options/locale/locale_en-US.ini index 93b34963f2..34f8ec3c7a 100755 --- a/options/locale/locale_en-US.ini +++ b/options/locale/locale_en-US.ini @@ -2747,10 +2747,11 @@ reject_pull_request = `suggested changes for %s#%[2]s` upload_dataset=`upload dataset %s` task_gpudebugjob=`created CPU/GPU type debugging task%s` task_npudebugjob=`created NPU type debugging task %s` -task_trainjob=`created training task%s` +task_nputrainjob=`created NPU 
training task%s` task_inferencejob=`created reasoning task %s` task_benchmark=`created profiling task %s` task_createmodel=`created new model %s` +task_gputrainjob=`created CPU/GPU training task%s` [tool] ago = %s ago diff --git a/options/locale/locale_zh-CN.ini b/options/locale/locale_zh-CN.ini index 8f1adc4001..d0f246b4ac 100755 --- a/options/locale/locale_zh-CN.ini +++ b/options/locale/locale_zh-CN.ini @@ -2755,7 +2755,7 @@ task_nputrainjob=`创建了NPU类型训练任务 %s` task_benchmark=`创建了评测任务 %s` task_createmodel=`导入了新模型 %s` -task_gputrainjob=`创建了CPU/GPU类型训练任务 %s` +task_gputrainjob=`创建了CPU/GPU类型训练任务 %s` [tool] ago=%s前 diff --git a/public/home/home.js b/public/home/home.js index 237ede3e77..478c70f217 100755 --- a/public/home/home.js +++ b/public/home/home.js @@ -175,6 +175,8 @@ function getTaskLink(record){ re = re + "/cloudbrain/benchmark/" + record.Content; }else if(record.OpType == 30){ re = re + "/modelmanage/show_model_info?name=" + record.RefName; + }else if(record.OpType == 31){ + re = re + "/cloudbrain/train-job/" + record.Content; } re = encodeURI(re); return re; diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 47f160e069..ed7cfbe986 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -485,6 +485,9 @@ func TrainJobIndex(ctx *context.Context) { } listType := ctx.Query("listType") + if len(listType) == 0 { + listType = models.AllResource + } ctx.Data["ListType"] = listType typeCloudBrain := models.TypeCloudBrainAll @@ -531,6 +534,7 @@ func TrainJobIndex(ctx *context.Context) { pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5) pager.SetDefaultParams(ctx) + pager.AddParam(ctx, "listType", "ListType") ctx.Data["Page"] = pager ctx.Data["PageIsCloudBrain"] = true @@ -1567,6 +1571,7 @@ func trainJobGetLog(jobID string) (*models.GetTrainJobLogFileNamesResult, *model func TrainJobDel(ctx *context.Context) { var jobID = ctx.Params(":jobid") + var listType = ctx.Query("listType") repo := 
ctx.Repo.Repository var jobTypes []string @@ -1608,12 +1613,13 @@ func TrainJobDel(ctx *context.Context) { if ctx.IsUserSiteAdmin() && isAdminPage == "true" { ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains") } else { - ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType) } } func TrainJobStop(ctx *context.Context) { var jobID = ctx.Params(":jobid") + var listType = ctx.Query("listType") task := ctx.Cloudbrain _, err := modelarts.StopTrainJob(jobID, strconv.FormatInt(task.VersionID, 10)) @@ -1623,7 +1629,7 @@ func TrainJobStop(ctx *context.Context) { return } - ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType) } func canUserCreateTrainJob(uid int64) (bool, error) { diff --git a/templates/repo/cloudbrain/trainjob/new.tmpl b/templates/repo/cloudbrain/trainjob/new.tmpl index d1faebc98e..5bd689911a 100755 --- a/templates/repo/cloudbrain/trainjob/new.tmpl +++ b/templates/repo/cloudbrain/trainjob/new.tmpl @@ -227,7 +227,7 @@ - {{.i18n.Tr "repo.cloudbrain.cancel"}} + {{.i18n.Tr "repo.cloudbrain.cancel"}} @@ -239,8 +239,8 @@ {{template "base/footer" .}}