你确认删除该任务么?此任务一旦删除不可恢复。
+From 647ff435e526d447e3c11e2100674391ff4cd52d Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Wed, 16 Mar 2022 14:18:23 +0800 Subject: [PATCH 01/31] index --- models/cloudbrain.go | 7 ++++++ models/file_chunk.go | 5 ---- modules/modelarts/modelarts.go | 1 - routers/admin/cloudbrains.go | 6 ++--- routers/repo/modelarts.go | 45 +++++++++++++++++++++++++--------- 5 files changed, 44 insertions(+), 20 deletions(-) mode change 100644 => 100755 routers/admin/cloudbrains.go diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 06c2e98b4f..96b827994e 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -19,9 +19,16 @@ type CloudbrainStatus string type JobType string type ModelArtsJobStatus string +const ( + TypeCloudBrainAll = -1 + TypeCloudBrainOne int = iota + TypeCloudBrainTwo +) + const ( NPUResource = "NPU" GPUResource = "CPU/GPU" + AllResource = "all" //notebook storage category EVSCategory = "EVS" diff --git a/models/file_chunk.go b/models/file_chunk.go index 76c926dc5f..0fc3a88794 100755 --- a/models/file_chunk.go +++ b/models/file_chunk.go @@ -13,11 +13,6 @@ const ( FileUploaded ) -const ( - TypeCloudBrainOne int = iota - TypeCloudBrainTwo -) - type FileChunk struct { ID int64 `xorm:"pk autoincr"` UUID string `xorm:"uuid UNIQUE"` diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index b740b11675..e6eaa15e2a 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -54,7 +54,6 @@ const ( PerPage = 10 IsLatestVersion = "1" NotLatestVersion = "0" - DebugType = -1 VersionCount = 1 SortByCreateTime = "create_time" diff --git a/routers/admin/cloudbrains.go b/routers/admin/cloudbrains.go old mode 100644 new mode 100755 index 6bbd534b9b..884ed6b9b7 --- a/routers/admin/cloudbrains.go +++ b/routers/admin/cloudbrains.go @@ -41,7 +41,7 @@ func CloudBrains(ctx *context.Context) { if page <= 0 { page = 1 } - debugType := modelarts.DebugType + debugType := models.TypeCloudBrainAll if listType 
== models.GPUResource { debugType = models.TypeCloudBrainOne } else if listType == models.NPUResource { @@ -121,7 +121,7 @@ func DownloadCloudBrains(ctx *context.Context) { Page: page, PageSize: 1, }, - Type: modelarts.DebugType, + Type: models.TypeCloudBrainAll, NeedRepoInfo: false, IsLatestVersion: modelarts.IsLatestVersion, }) @@ -151,7 +151,7 @@ func DownloadCloudBrains(ctx *context.Context) { Page: page, PageSize: pageSize, }, - Type: modelarts.DebugType, + Type: models.TypeCloudBrainAll, NeedRepoInfo: true, IsLatestVersion: modelarts.IsLatestVersion, }) diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 9c670e2037..26e09ec427 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -46,20 +46,26 @@ const ( ) func DebugJobIndex(ctx *context.Context) { - debugListType := ctx.Query("debugListType") - ctx.Data["ListType"] = debugListType + listType := ctx.Query("debugListType") + ctx.Data["ListType"] = listType MustEnableCloudbrain(ctx) repo := ctx.Repo.Repository page := ctx.QueryInt("page") if page <= 0 { page = 1 } - debugType := modelarts.DebugType + typeCloudBrain := models.TypeCloudBrainAll jobTypeNot := false - if debugListType == models.GPUResource { - debugType = models.TypeCloudBrainOne - } else if debugListType == models.NPUResource { - debugType = models.TypeCloudBrainTwo + if listType == models.GPUResource { + typeCloudBrain = models.TypeCloudBrainOne + } else if listType == models.NPUResource { + typeCloudBrain = models.TypeCloudBrainTwo + } else if listType == models.AllResource { + typeCloudBrain = models.TypeCloudBrainAll + } else { + log.Error("listType(%s) error", listType) + ctx.ServerError("listType error", errors.New("listType error")) + return } var jobTypes []string @@ -70,7 +76,7 @@ func DebugJobIndex(ctx *context.Context) { PageSize: setting.UI.IssuePagingNum, }, RepoID: repo.ID, - Type: debugType, + Type: typeCloudBrain, JobTypeNot: jobTypeNot, JobTypes: jobTypes, }) @@ -92,7 +98,7 @@ func 
DebugJobIndex(ctx *context.Context) { ctx.Data["Tasks"] = ciTasks ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx) ctx.Data["RepoIsEmpty"] = repo.IsEmpty - ctx.Data["debugListType"] = debugListType + ctx.Data["debugListType"] = listType ctx.HTML(200, tplDebugJobIndex) } @@ -473,6 +479,23 @@ func TrainJobIndex(ctx *context.Context) { page = 1 } + listType := ctx.Query("listType") + ctx.Data["ListType"] = listType + + typeCloudBrain := models.TypeCloudBrainAll + if listType == models.GPUResource { + typeCloudBrain = models.TypeCloudBrainOne + } else if listType == models.NPUResource { + typeCloudBrain = models.TypeCloudBrainTwo + } else if listType == models.AllResource { + typeCloudBrain = models.TypeCloudBrainAll + } + //else { + // log.Error("listType(%s) error", listType) + // ctx.ServerError("listType error", errors.New("listType error")) + // return + //} + var jobTypes []string jobTypes = append(jobTypes, string(models.JobTypeTrain)) tasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{ @@ -481,7 +504,7 @@ func TrainJobIndex(ctx *context.Context) { PageSize: setting.UI.IssuePagingNum, }, RepoID: repo.ID, - Type: models.TypeCloudBrainTwo, + Type: typeCloudBrain, JobTypeNot: false, JobTypes: jobTypes, IsLatestVersion: modelarts.IsLatestVersion, @@ -2246,7 +2269,7 @@ func SetJobCount(ctx *context.Context) { repoId := ctx.Repo.Repository.ID _, jobCount, err := models.Cloudbrains(&models.CloudbrainsOptions{ RepoID: repoId, - Type: modelarts.DebugType, + Type: models.TypeCloudBrainAll, }) if err != nil { ctx.ServerError("Get job faild:", err) -- 2.34.1 From 8dfb8761bfddae46115c61b14815f58d6a444e26 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Wed, 16 Mar 2022 18:12:10 +0800 Subject: [PATCH 02/31] create --- modules/auth/cloudbrain.go | 3 +++ routers/repo/cloudbrain.go | 38 +++++++++++++++++++++++++++++--------- routers/repo/modelarts.go | 6 +++++- routers/routes/routes.go | 15 +++++++++++++++ 4 files changed, 52 
insertions(+), 10 deletions(-) diff --git a/modules/auth/cloudbrain.go b/modules/auth/cloudbrain.go index 9949feddc1..9d3d6290f5 100755 --- a/modules/auth/cloudbrain.go +++ b/modules/auth/cloudbrain.go @@ -20,6 +20,9 @@ type CreateCloudBrainForm struct { ResourceSpecId int `form:"resource_spec_id" binding:"Required"` BenchmarkTypeID int `form:"benchmark_types_id"` BenchmarkChildTypeID int `form:"benchmark_child_types_id"` + BootFile string `form:"boot_file"` + Params string `form:"run_para_list"` + BranchName string `form:"branch_name"` } type CommitImageCloudBrainForm struct { diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 6e88b266db..7125935aae 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -35,6 +35,8 @@ const ( tplCloudBrainBenchmarkIndex base.TplName = "repo/cloudbrain/benchmark/index" tplCloudBrainBenchmarkNew base.TplName = "repo/cloudbrain/benchmark/new" tplCloudBrainBenchmarkShow base.TplName = "repo/cloudbrain/benchmark/show" + + tplCloudBrainTrainJobNew base.TplName = "repo/cloudbrain/trainjob/new" ) var ( @@ -187,32 +189,37 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { resourceSpecId := form.ResourceSpecId repo := ctx.Repo.Repository - tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeDebug), displayJobName) + tpl := tplCloudBrainNew + if jobType == string(models.JobTypeTrain) { + tpl = tplCloudBrainTrainJobNew + } + + tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, jobType, displayJobName) if err == nil { if len(tasks) != 0 { log.Error("the job name did already exist", ctx.Data["MsgID"]) cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr("the job name did already exist", tplCloudBrainNew, &form) + ctx.RenderWithErr("the job name did already exist", tpl, &form) return } } else { if !models.IsErrJobNotExist(err) { log.Error("system error, %v", err, ctx.Data["MsgID"]) cloudBrainNewDataPrepare(ctx) - 
ctx.RenderWithErr("system error", tplCloudBrainNew, &form) + ctx.RenderWithErr("system error", tpl, &form) return } } if !jobNamePattern.MatchString(displayJobName) { - ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tplCloudBrainNew, &form) + ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form) return } if jobType != string(models.JobTypeBenchmark) && jobType != string(models.JobTypeDebug) && jobType != string(models.JobTypeSnn4imagenet) && jobType != string(models.JobTypeBrainScore) { log.Error("jobtype error:", jobType, ctx.Data["MsgID"]) cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr("jobtype error", tplCloudBrainNew, &form) + ctx.RenderWithErr("jobtype error", tpl, &form) return } @@ -220,13 +227,13 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { if err != nil { log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"]) cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr("system error", tplCloudBrainNew, &form) + ctx.RenderWithErr("system error", tpl, &form) return } else { if count >= 1 { log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplCloudBrainNew, &form) + ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form) return } } @@ -269,11 +276,15 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { 0, 0, resourceSpecId) if err != nil { cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr(err.Error(), tplCloudBrainNew, &form) + ctx.RenderWithErr(err.Error(), tpl, &form) return } - ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all") + if jobType == string(models.JobTypeTrain) { + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=all") + } else { + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + 
"/debugjob?debugListType=all") + } } func CloudBrainRestart(ctx *context.Context) { @@ -1395,3 +1406,12 @@ func BenchmarkDel(ctx *context.Context) { ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark") } } + +func CloudBrainTrainJobNew(ctx *context.Context) { + err := cloudBrainNewDataPrepare(ctx) + if err != nil { + ctx.ServerError("get new train-job info failed", err) + return + } + ctx.HTML(http.StatusOK, tplCloudBrainTrainJobNew) +} diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 26e09ec427..88ed7c4b08 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -517,7 +517,11 @@ func TrainJobIndex(ctx *context.Context) { for i, task := range tasks { tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain) tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain) - tasks[i].ComputeResource = models.NPUResource + if task.Cloudbrain.Type == models.TypeCloudBrainOne { + tasks[i].ComputeResource = models.GPUResource + } else if task.Cloudbrain.Type == models.TypeCloudBrainTwo { + tasks[i].ComputeResource = models.NPUResource + } } pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5) diff --git a/routers/routes/routes.go b/routers/routes/routes.go index 2d146c2c6c..5082448ad0 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -1016,6 +1016,21 @@ func RegisterRoutes(m *macaron.Macaron) { m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.CloudBrainBenchmarkCreate) m.Get("/get_child_types", repo.GetChildTypes) }) + + m.Group("/train-job", func() { + m.Group("/:jobid", func() { + m.Get("", reqRepoCloudBrainReader, repo.TrainJobShow) + m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.TrainJobStop) + m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.TrainJobDel) + m.Get("/model_download", cloudbrain.AdminOrJobCreaterRightForTrain, repo.ModelDownload) + 
//m.Get("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, repo.TrainJobNewVersion) + //m.Post("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion) + }) + m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.CloudBrainTrainJobNew) + m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.CloudBrainCreate) + + //m.Get("/para-config-list", reqRepoCloudBrainReader, repo.TrainJobGetConfigList) + }) }, context.RepoRef()) m.Group("/modelmanage", func() { m.Post("/create_model", reqRepoModelManageWriter, repo.SaveModel) -- 2.34.1 From 8a395a68d5a6a77ada0fe64c3ac1b3c05feabcc3 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Thu, 17 Mar 2022 11:44:23 +0800 Subject: [PATCH 03/31] create --- modules/cloudbrain/cloudbrain.go | 2 ++ routers/repo/cloudbrain.go | 36 ++++++++++++++++++++++++++------ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index 54ac0c7acf..dc3f483b73 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -32,6 +32,8 @@ const ( SubTaskName = "task1" Success = "S000" + + DefaultBranchName = "master" ) var ( diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 7125935aae..923ff09533 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -183,15 +183,24 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { image := form.Image uuid := form.Attachment jobType := form.JobType - command := cloudbrain.Command gpuQueue := form.GpuType codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath resourceSpecId := form.ResourceSpecId + branchName := form.BranchName repo := ctx.Repo.Repository tpl := tplCloudBrainNew + command := cloudbrain.Command if jobType == string(models.JobTypeTrain) 
{ tpl = tplCloudBrainTrainJobNew + command, err := getTrainJobCommand(form) + if err != nil { + log.Error("getTrainJobCommand failed: %v", err) + ctx.RenderWithErr(err.Error(), tpl, &form) + return + } + + log.Info("%s", command) } tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, jobType, displayJobName) @@ -216,7 +225,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { return } - if jobType != string(models.JobTypeBenchmark) && jobType != string(models.JobTypeDebug) && jobType != string(models.JobTypeSnn4imagenet) && jobType != string(models.JobTypeBrainScore) { + if jobType != string(models.JobTypeBenchmark) && jobType != string(models.JobTypeDebug) && jobType != string(models.JobTypeSnn4imagenet) && jobType != string(models.JobTypeBrainScore) && jobType != string(models.JobTypeTrain) { log.Error("jobtype error:", jobType, ctx.Data["MsgID"]) cloudBrainNewDataPrepare(ctx) ctx.RenderWithErr("jobtype error", tpl, &form) @@ -238,7 +247,10 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { } } - downloadCode(repo, codePath) + if branchName == "" { + branchName = cloudbrain.DefaultBranchName + } + downloadCode(repo, codePath, branchName) uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/") modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/" @@ -764,8 +776,8 @@ func GetRate(ctx *context.Context) { } } -func downloadCode(repo *models.Repository, codePath string) error { - if err := git.Clone(repo.RepoPath(), codePath, git.CloneRepoOptions{}); err != nil { +func downloadCode(repo *models.Repository, codePath, branchName string) error { + if err := git.Clone(repo.RepoPath(), codePath, git.CloneRepoOptions{Branch: branchName}); err != nil { log.Error("Failed to clone repository: %s (%v)", repo.FullName(), err) return err } @@ -1316,7 +1328,7 @@ func CloudBrainBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainF } os.RemoveAll(codePath) - if err := 
downloadCode(repo, codePath); err != nil { + if err := downloadCode(repo, codePath, cloudbrain.DefaultBranchName); err != nil { log.Error("downloadCode failed, %v", err, ctx.Data["MsgID"]) cloudBrainNewDataPrepare(ctx) ctx.RenderWithErr("system error", tplCloudBrainBenchmarkNew, &form) @@ -1415,3 +1427,15 @@ func CloudBrainTrainJobNew(ctx *context.Context) { } ctx.HTML(http.StatusOK, tplCloudBrainTrainJobNew) } + +func getTrainJobCommand(form auth.CreateCloudBrainForm) (string, error) { + var command string + bootFile := form.BootFile + + if !strings.HasSuffix(bootFile, ".py") { + log.Error("bootFile(%s) format error", bootFile) + return command, errors.New("bootFile format error") + } + + return command, nil +} -- 2.34.1 From bcae95e566eb8ef80957eef1e950c2e0910edcfe Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Thu, 17 Mar 2022 19:17:52 +0800 Subject: [PATCH 04/31] create --- models/action.go | 1 + modules/cloudbrain/cloudbrain.go | 33 +- modules/setting/setting.go | 30 +- routers/repo/cloudbrain.go | 35 +- templates/repo/cloudbrain/trainjob/new.tmpl | 427 ++++++++++++++++++++ 5 files changed, 501 insertions(+), 25 deletions(-) create mode 100755 templates/repo/cloudbrain/trainjob/new.tmpl diff --git a/models/action.go b/models/action.go index 2a9d88399b..9b92b4192d 100755 --- a/models/action.go +++ b/models/action.go @@ -57,6 +57,7 @@ const ( ActionCreateInferenceTask // 28 ActionCreateBenchMarkTask //29 ActionCreateNewModelTask //30 + ActionCreateGPUTrainTask //31 ) // Action represents user operation type and other information to diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index dc3f483b73..8b0786b57b 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -15,9 +15,7 @@ import ( ) const ( - Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple; - service ssh stop; - jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" 
--port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"` + Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;set TEST1=1111;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"` //CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"` CommandBenchmark = `echo "start benchmark";cd /benchmark && bash run_bk.sh;echo "end benchmark"` CodeMountPath = "/code" @@ -37,7 +35,8 @@ const ( ) var ( - ResourceSpecs *models.ResourceSpecs + ResourceSpecs *models.ResourceSpecs + TrainResourceSpecs *models.ResourceSpecs ) func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool { @@ -157,12 +156,23 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, uuid var resourceSpec *models.ResourceSpec - if ResourceSpecs == nil { - json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) - } - for _, spec := range ResourceSpecs.ResourceSpec { - if resourceSpecId == spec.Id { - resourceSpec = spec + if jobType == string(models.JobTypeDebug) { + if ResourceSpecs == nil { + json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) + } + for _, spec := range ResourceSpecs.ResourceSpec { + if resourceSpecId == spec.Id { + resourceSpec = spec + } + } + } else if jobType == string(models.JobTypeTrain) { + if TrainResourceSpecs == nil { + json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) + } + for _, spec := range TrainResourceSpecs.ResourceSpec { + if resourceSpecId == spec.Id { + resourceSpec = spec + } } } @@ -265,6 +275,7 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, BenchmarkTypeID: benchmarkTypeID, BenchmarkChildTypeID: benchmarkChildTypeID, Description: description, + IsLatestVersion: "1", }) if err != nil { @@ -280,6 +291,8 @@ func 
GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, if string(models.JobTypeBenchmark) == jobType { notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateBenchMarkTask) + } else if string(models.JobTypeTrain) == jobType { + notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateGPUTrainTask) } else { notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugGPUTask) } diff --git a/modules/setting/setting.go b/modules/setting/setting.go index 2a29dd700a..7ae2263f74 100755 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -450,16 +450,18 @@ var ( DecompressOBSTaskName string //cloudbrain config - CBAuthUser string - CBAuthPassword string - RestServerHost string - JobPath string - CBCodePathPrefix string - JobType string - GpuTypes string - DebugServerHost string - ResourceSpecs string - MaxDuration int64 + CBAuthUser string + CBAuthPassword string + RestServerHost string + JobPath string + CBCodePathPrefix string + JobType string + GpuTypes string + DebugServerHost string + ResourceSpecs string + MaxDuration int64 + TrainGpuTypes string + TrainResourceSpecs string //benchmark config IsBenchmarkEnabled bool @@ -512,9 +514,9 @@ var ( ProfileID string PoolInfos string Flavor string - DebugHost string - ImageInfos string - Capacity int + DebugHost string + ImageInfos string + Capacity int //train-job ResourcePools string Engines string @@ -1283,6 +1285,8 @@ func NewContext() { GpuTypes = sec.Key("GPU_TYPES").MustString("") ResourceSpecs = sec.Key("RESOURCE_SPECS").MustString("") MaxDuration = sec.Key("MAX_DURATION").MustInt64(14400) + TrainGpuTypes = sec.Key("TRAIN_GPU_TYPES").MustString("") + TrainResourceSpecs = sec.Key("TRAIN_RESOURCE_SPECS").MustString("") sec = Cfg.Section("benchmark") IsBenchmarkEnabled = sec.Key("ENABLED").MustBool(false) diff --git 
a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 923ff09533..fb7909c8c9 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -45,6 +45,7 @@ var ( benchmarkTypes *models.BenchmarkTypes benchmarkGpuInfos *models.GpuInfos benchmarkResourceSpecs *models.ResourceSpecs + trainGpuInfos *models.GpuInfos ) var jobNamePattern = regexp.MustCompile(`^[a-z0-9][a-z0-9-_]{1,34}[a-z0-9-]$`) @@ -144,6 +145,11 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { } ctx.Data["gpu_types"] = gpuInfos.GpuInfo + if trainGpuInfos == nil { + json.Unmarshal([]byte(setting.TrainGpuTypes), &trainGpuInfos) + } + ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo + if benchmarkGpuInfos == nil { json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) } @@ -158,6 +164,14 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) } ctx.Data["resource_specs"] = cloudbrain.ResourceSpecs.ResourceSpec + + if cloudbrain.TrainResourceSpecs == nil { + json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs) + } + ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec + ctx.Data["params"] = "" + ctx.Data["branchName"] = ctx.Repo.BranchName + ctx.Data["snn4imagenet_path"] = cloudbrain.Snn4imagenetMountPath ctx.Data["is_snn4imagenet_enabled"] = setting.IsSnn4imagenetEnabled @@ -193,14 +207,14 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { command := cloudbrain.Command if jobType == string(models.JobTypeTrain) { tpl = tplCloudBrainTrainJobNew - command, err := getTrainJobCommand(form) + commandTrain, err := getTrainJobCommand(form) if err != nil { log.Error("getTrainJobCommand failed: %v", err) ctx.RenderWithErr(err.Error(), tpl, &form) return } - log.Info("%s", command) + command = commandTrain } tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, jobType, displayJobName) @@ 
-1431,11 +1445,28 @@ func CloudBrainTrainJobNew(ctx *context.Context) { func getTrainJobCommand(form auth.CreateCloudBrainForm) (string, error) { var command string bootFile := form.BootFile + params := form.Params if !strings.HasSuffix(bootFile, ".py") { log.Error("bootFile(%s) format error", bootFile) return command, errors.New("bootFile format error") } + var parameters models.Parameters + if len(params) != 0 { + err := json.Unmarshal([]byte(params), &parameters) + if err != nil { + log.Error("Failed to Unmarshal params: %s (%v)", params, err) + return command, err + } + + for _, parameter := range parameters.Parameter { + command += "set " + parameter.Label + "=" + parameter.Value + ";" + } + } + + command += "python /code/" + bootFile + log.Info("command:" + command) + return command, nil } diff --git a/templates/repo/cloudbrain/trainjob/new.tmpl b/templates/repo/cloudbrain/trainjob/new.tmpl new file mode 100755 index 0000000000..24c7332634 --- /dev/null +++ b/templates/repo/cloudbrain/trainjob/new.tmpl @@ -0,0 +1,427 @@ +{{template "base/head" .}} + + +