|
|
@@ -0,0 +1,724 @@ |
|
|
|
package repo |
|
|
|
|
|
|
|
import ( |
|
|
|
"code.gitea.io/gitea/modules/auth" |
|
|
|
"code.gitea.io/gitea/modules/git" |
|
|
|
"code.gitea.io/gitea/modules/grampus" |
|
|
|
"code.gitea.io/gitea/modules/modelarts" |
|
|
|
"code.gitea.io/gitea/modules/timeutil" |
|
|
|
"code.gitea.io/gitea/modules/util" |
|
|
|
"encoding/json" |
|
|
|
"errors" |
|
|
|
"github.com/unknwon/com" |
|
|
|
"io/ioutil" |
|
|
|
"net/http" |
|
|
|
"os" |
|
|
|
"path" |
|
|
|
"strconv" |
|
|
|
"strings" |
|
|
|
"time" |
|
|
|
|
|
|
|
"code.gitea.io/gitea/models" |
|
|
|
"code.gitea.io/gitea/modules/base" |
|
|
|
"code.gitea.io/gitea/modules/cloudbrain" |
|
|
|
"code.gitea.io/gitea/modules/context" |
|
|
|
"code.gitea.io/gitea/modules/log" |
|
|
|
"code.gitea.io/gitea/modules/setting" |
|
|
|
) |
|
|
|
|
|
|
|
const ( |
|
|
|
tplGrampusTrainJobShow base.TplName = "repo/grampus/trainjob/show" |
|
|
|
|
|
|
|
//GPU |
|
|
|
tplGrampusTrainJobGPUNew base.TplName = "repo/grampus/trainjob/gpu/new" |
|
|
|
|
|
|
|
//NPU |
|
|
|
tplGrampusTrainJobNPUNew base.TplName = "repo/grampus/trainjob/npu/new" |
|
|
|
) |
|
|
|
|
|
|
|
func GrampusTrainJobGPUNew(ctx *context.Context) { |
|
|
|
ctx.Data["datasetType"] = models.TypeCloudBrainOne |
|
|
|
err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
if err != nil { |
|
|
|
ctx.ServerError("get new train-job info failed", err) |
|
|
|
return |
|
|
|
} |
|
|
|
ctx.HTML(http.StatusOK, tplGrampusTrainJobGPUNew) |
|
|
|
} |
|
|
|
|
|
|
|
func GrampusTrainJobNPUNew(ctx *context.Context) { |
|
|
|
ctx.Data["datasetType"] = models.TypeCloudBrainTwo |
|
|
|
err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) |
|
|
|
if err != nil { |
|
|
|
ctx.ServerError("get new train-job info failed", err) |
|
|
|
return |
|
|
|
} |
|
|
|
ctx.HTML(200, tplGrampusTrainJobNPUNew) |
|
|
|
} |
|
|
|
|
|
|
|
func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) error { |
|
|
|
ctx.Data["PageIsCloudBrain"] = true |
|
|
|
|
|
|
|
t := time.Now() |
|
|
|
var displayJobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:] |
|
|
|
ctx.Data["display_job_name"] = displayJobName |
|
|
|
|
|
|
|
//get valid images |
|
|
|
images, err := grampus.GetImages(processType) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetImages failed:", err.Error()) |
|
|
|
} else { |
|
|
|
ctx.Data["images"] = images.Infos |
|
|
|
} |
|
|
|
|
|
|
|
//get valid resource specs |
|
|
|
specs, err := grampus.GetResourceSpecs(processType) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetResourceSpecs failed:", err.Error()) |
|
|
|
} else { |
|
|
|
ctx.Data["flavor_infos"] = specs.Infos |
|
|
|
} |
|
|
|
|
|
|
|
//get branches |
|
|
|
branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetBranches error:", err.Error()) |
|
|
|
} else { |
|
|
|
ctx.Data["branches"] = branches |
|
|
|
} |
|
|
|
|
|
|
|
ctx.Data["branchName"] = ctx.Repo.BranchName |
|
|
|
|
|
|
|
if processType == grampus.ProcessorTypeGPU { |
|
|
|
ctx.Data["datasetType"] = models.TypeCloudBrainOne |
|
|
|
} else if processType == grampus.ProcessorTypeNPU { |
|
|
|
ctx.Data["datasetType"] = models.TypeCloudBrainTwo |
|
|
|
} |
|
|
|
|
|
|
|
return nil |
|
|
|
} |
|
|
|
|
|
|
|
func grampusParamCheckCreateTrainJob(form auth.CreateGrampusTrainJobForm) error { |
|
|
|
if !strings.HasSuffix(strings.TrimSpace(form.BootFile), ".py") { |
|
|
|
log.Error("the boot file(%s) must be a python file", form.BootFile) |
|
|
|
return errors.New("启动文件必须是python文件") |
|
|
|
} |
|
|
|
|
|
|
|
if form.BranchName == "" { |
|
|
|
log.Error("the branch must not be null!", form.BranchName) |
|
|
|
return errors.New("代码分支不能为空!") |
|
|
|
} |
|
|
|
|
|
|
|
return nil |
|
|
|
} |
|
|
|
|
|
|
|
func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) { |
|
|
|
displayJobName := form.DisplayJobName |
|
|
|
jobName := util.ConvertDisplayJobNameToJobName(displayJobName) |
|
|
|
uuid := form.Attachment |
|
|
|
description := form.Description |
|
|
|
bootFile := strings.TrimSpace(form.BootFile) |
|
|
|
params := form.Params |
|
|
|
repo := ctx.Repo.Repository |
|
|
|
codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/" |
|
|
|
codeMinioPath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/" |
|
|
|
dataMinioPath := setting.Attachment.Minio.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid |
|
|
|
branchName := form.BranchName |
|
|
|
flavorName := form.FlavorName |
|
|
|
image := strings.TrimSpace(form.Image) |
|
|
|
|
|
|
|
if !jobNamePattern.MatchString(displayJobName) { |
|
|
|
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
//check count limit |
|
|
|
count, err := models.GetGrampusCountByUserID(ctx.User.ID, string(models.JobTypeTrain), models.GPUResource) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("system error", tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} else { |
|
|
|
if count >= 1 { |
|
|
|
log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
//check param |
|
|
|
if err := grampusParamCheckCreateTrainJob(form); err != nil { |
|
|
|
log.Error("paramCheckCreateTrainJob failed:(%v)", err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr(err.Error(), tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
//check whether the task name in the project is duplicated |
|
|
|
tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName) |
|
|
|
if err == nil { |
|
|
|
if len(tasks) != 0 { |
|
|
|
log.Error("the job name did already exist", ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("the job name did already exist", tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
} else { |
|
|
|
if !models.IsErrJobNotExist(err) { |
|
|
|
log.Error("system error, %v", err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("system error", tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
//check dataset |
|
|
|
attachment, err := models.GetAttachmentByUUID(uuid) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetAttachmentByUUID failed:", err.Error(), ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("dataset is not exist", tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
//prepare code and out path |
|
|
|
_, err = ioutil.ReadDir(codeLocalPath) |
|
|
|
if err == nil { |
|
|
|
os.RemoveAll(codeLocalPath) |
|
|
|
} |
|
|
|
|
|
|
|
if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil { |
|
|
|
log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
//todo: upload code (send to file_server todo this work?) |
|
|
|
//upload code |
|
|
|
if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil { |
|
|
|
log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/" |
|
|
|
if err := mkModelPath(modelPath); err != nil { |
|
|
|
log.Error("Failed to mkModelPath: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
//init model readme |
|
|
|
if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil { |
|
|
|
log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
//prepare command |
|
|
|
command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", dataMinioPath, bootFile, params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", attachment.Name) |
|
|
|
if err != nil { |
|
|
|
log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName) |
|
|
|
|
|
|
|
req := &grampus.GenerateTrainJobReq{ |
|
|
|
JobName: jobName, |
|
|
|
DisplayJobName: displayJobName, |
|
|
|
ComputeResource: models.GPUResource, |
|
|
|
Command: command, |
|
|
|
ResourceSpecId: form.FlavorID, |
|
|
|
ImageUrl: image, |
|
|
|
Description: description, |
|
|
|
BootFile: bootFile, |
|
|
|
Uuid: uuid, |
|
|
|
CommitID: commitID, |
|
|
|
BranchName: branchName, |
|
|
|
Params: form.Params, |
|
|
|
FlavorName: flavorName, |
|
|
|
EngineName: image, |
|
|
|
DatasetName: attachment.Name, |
|
|
|
IsLatestVersion: modelarts.IsLatestVersion, |
|
|
|
VersionCount: modelarts.VersionCount, |
|
|
|
WorkServerNumber: 1, |
|
|
|
} |
|
|
|
|
|
|
|
err = grampus.GenerateTrainJob(ctx, req) |
|
|
|
if err != nil { |
|
|
|
log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr(err.Error(), tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") |
|
|
|
} |
|
|
|
|
|
|
|
func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) { |
|
|
|
displayJobName := form.DisplayJobName |
|
|
|
jobName := util.ConvertDisplayJobNameToJobName(displayJobName) |
|
|
|
uuid := form.Attachment |
|
|
|
description := form.Description |
|
|
|
bootFile := strings.TrimSpace(form.BootFile) |
|
|
|
params := form.Params |
|
|
|
repo := ctx.Repo.Repository |
|
|
|
codeLocalPath := setting.JobPath + jobName + modelarts.CodePath |
|
|
|
codeObsPath := grampus.JobPath + jobName + modelarts.CodePath |
|
|
|
dataObsPath := setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" |
|
|
|
branchName := form.BranchName |
|
|
|
isLatestVersion := modelarts.IsLatestVersion |
|
|
|
flavorName := form.FlavorName |
|
|
|
versionCount := modelarts.VersionCount |
|
|
|
engineName := form.EngineName |
|
|
|
|
|
|
|
if !jobNamePattern.MatchString(displayJobName) { |
|
|
|
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
//check count limit |
|
|
|
count, err := models.GetGrampusCountByUserID(ctx.User.ID, string(models.JobTypeTrain), models.NPUResource) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) |
|
|
|
ctx.RenderWithErr("system error", tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} else { |
|
|
|
if count >= 1 { |
|
|
|
log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) |
|
|
|
ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
//check param |
|
|
|
if err := grampusParamCheckCreateTrainJob(form); err != nil { |
|
|
|
log.Error("paramCheckCreateTrainJob failed:(%v)", err) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) |
|
|
|
ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
//check whether the task name in the project is duplicated |
|
|
|
tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName) |
|
|
|
if err == nil { |
|
|
|
if len(tasks) != 0 { |
|
|
|
log.Error("the job name did already exist", ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) |
|
|
|
ctx.RenderWithErr("the job name did already exist", tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
} else { |
|
|
|
if !models.IsErrJobNotExist(err) { |
|
|
|
log.Error("system error, %v", err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) |
|
|
|
ctx.RenderWithErr("system error", tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
//check dataset |
|
|
|
attachment, err := models.GetAttachmentByUUID(uuid) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetAttachmentByUUID failed:", err.Error(), ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) |
|
|
|
ctx.RenderWithErr("dataset is not exist", tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
//prepare code and out path |
|
|
|
_, err = ioutil.ReadDir(codeLocalPath) |
|
|
|
if err == nil { |
|
|
|
os.RemoveAll(codeLocalPath) |
|
|
|
} |
|
|
|
|
|
|
|
if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil { |
|
|
|
log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) |
|
|
|
ctx.RenderWithErr("Create task failed, server timed out", tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
//todo: upload code (send to file_server todo this work?) |
|
|
|
if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil { |
|
|
|
log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) |
|
|
|
ctx.RenderWithErr("Failed to obsMkdir_output", tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { |
|
|
|
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) |
|
|
|
ctx.RenderWithErr("Failed to uploadCodeToObs", tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
//prepare command |
|
|
|
command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", dataObsPath+attachment.Name, bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, attachment.Name) |
|
|
|
if err != nil { |
|
|
|
log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) |
|
|
|
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName) |
|
|
|
|
|
|
|
req := &grampus.GenerateTrainJobReq{ |
|
|
|
JobName: jobName, |
|
|
|
DisplayJobName: displayJobName, |
|
|
|
ComputeResource: models.NPUResource, |
|
|
|
Command: command, |
|
|
|
ResourceSpecId: form.FlavorID, |
|
|
|
ImageId: form.ImageID, |
|
|
|
DataUrl: dataObsPath, |
|
|
|
Description: description, |
|
|
|
CodeObsPath: codeObsPath, |
|
|
|
BootFileUrl: codeObsPath + bootFile, |
|
|
|
BootFile: bootFile, |
|
|
|
WorkServerNumber: form.WorkServerNumber, |
|
|
|
Uuid: uuid, |
|
|
|
CommitID: commitID, |
|
|
|
IsLatestVersion: isLatestVersion, |
|
|
|
BranchName: branchName, |
|
|
|
Params: form.Params, |
|
|
|
FlavorName: flavorName, |
|
|
|
EngineName: engineName, |
|
|
|
VersionCount: versionCount, |
|
|
|
TotalVersionCount: modelarts.TotalVersionCount, |
|
|
|
DatasetName: attachment.Name, |
|
|
|
} |
|
|
|
|
|
|
|
err = grampus.GenerateTrainJob(ctx, req) |
|
|
|
if err != nil { |
|
|
|
log.Error("GenerateTrainJob failed:%v", err.Error()) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) |
|
|
|
ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") |
|
|
|
} |
|
|
|
|
|
|
|
func GrampusStopJob(ctx *context.Context) { |
|
|
|
var ID = ctx.Params(":jobid") |
|
|
|
var resultCode = "0" |
|
|
|
var errorMsg = "" |
|
|
|
var status = "" |
|
|
|
|
|
|
|
task := ctx.Cloudbrain |
|
|
|
for { |
|
|
|
if task.Status == string(models.GrampusStatusStopped) || task.Status == string(models.GrampusStatusFailed) || task.Status == string(models.GrampusStatusSucceeded) { |
|
|
|
log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"]) |
|
|
|
resultCode = "-1" |
|
|
|
errorMsg = "system error" |
|
|
|
break |
|
|
|
} |
|
|
|
|
|
|
|
res, err := grampus.StopJob(task.JobID) |
|
|
|
if err != nil { |
|
|
|
log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"]) |
|
|
|
resultCode = strconv.Itoa(res.ErrorCode) |
|
|
|
errorMsg = res.ErrorMsg |
|
|
|
break |
|
|
|
} |
|
|
|
|
|
|
|
task.Status = string(models.GrampusStatusStopped) |
|
|
|
if task.EndTime == 0 { |
|
|
|
task.EndTime = timeutil.TimeStampNow() |
|
|
|
} |
|
|
|
task.ComputeAndSetDuration() |
|
|
|
err = models.UpdateJob(task) |
|
|
|
if err != nil { |
|
|
|
log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"]) |
|
|
|
resultCode = "-1" |
|
|
|
errorMsg = "system error" |
|
|
|
break |
|
|
|
} |
|
|
|
|
|
|
|
status = task.Status |
|
|
|
break |
|
|
|
} |
|
|
|
|
|
|
|
ctx.JSON(200, map[string]interface{}{ |
|
|
|
"result_code": resultCode, |
|
|
|
"error_msg": errorMsg, |
|
|
|
"status": status, |
|
|
|
"id": ID, |
|
|
|
"StatusOK": 0, |
|
|
|
}) |
|
|
|
} |
|
|
|
|
|
|
|
func GrampusTrainJobDel(ctx *context.Context) { |
|
|
|
var listType = ctx.Query("listType") |
|
|
|
if err := deleteGrampusJob(ctx); err != nil { |
|
|
|
log.Error("deleteGrampusJob failed: %v", err, ctx.Data["msgID"]) |
|
|
|
ctx.ServerError(err.Error(), err) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
var isAdminPage = ctx.Query("isadminpage") |
|
|
|
var isHomePage = ctx.Query("ishomepage") |
|
|
|
if ctx.IsUserSiteAdmin() && isAdminPage == "true" { |
|
|
|
ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains") |
|
|
|
} else if isHomePage == "true" { |
|
|
|
ctx.Redirect(setting.AppSubURL + "/cloudbrains") |
|
|
|
} else { |
|
|
|
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
func deleteGrampusJob(ctx *context.Context) error { |
|
|
|
task := ctx.Cloudbrain |
|
|
|
|
|
|
|
if task.Status != string(models.GrampusStatusStopped) && task.Status != string(models.GrampusStatusSucceeded) && task.Status != string(models.GrampusStatusFailed) { |
|
|
|
log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"]) |
|
|
|
return errors.New("the job has not been stopped") |
|
|
|
} |
|
|
|
|
|
|
|
err := models.DeleteJob(task) |
|
|
|
if err != nil { |
|
|
|
log.Error("DeleteJob failed: %v", err, ctx.Data["msgID"]) |
|
|
|
return err |
|
|
|
} |
|
|
|
|
|
|
|
storageType := models.TypeCloudBrainOne |
|
|
|
if task.ComputeResource == models.NPUResource { |
|
|
|
storageType = models.TypeCloudBrainTwo |
|
|
|
} |
|
|
|
deleteJobStorage(task.JobName, storageType) |
|
|
|
|
|
|
|
return nil |
|
|
|
} |
|
|
|
|
|
|
|
func GrampusTrainJobShow(ctx *context.Context) { |
|
|
|
ctx.Data["PageIsCloudBrain"] = true |
|
|
|
|
|
|
|
var task *models.Cloudbrain |
|
|
|
task, err := models.GetCloudbrainByJobIDWithDeleted(ctx.Params(":jobid")) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetCloudbrainByJobID failed:" + err.Error()) |
|
|
|
ctx.ServerError("system error", err) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
if task.DeletedAt.IsZero() { //normal record |
|
|
|
result, err := grampus.GetJob(task.JobID) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetJob failed:" + err.Error()) |
|
|
|
//ctx.ServerError("GetJob failed", err) |
|
|
|
//return |
|
|
|
} |
|
|
|
|
|
|
|
if result != nil { |
|
|
|
if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 { |
|
|
|
task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0] |
|
|
|
} |
|
|
|
task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status) |
|
|
|
if task.Status != result.JobInfo.Status || result.JobInfo.Status == models.GrampusStatusRunning { |
|
|
|
task.Duration = result.JobInfo.RunSec |
|
|
|
task.TrainJobDuration = models.ConvertDurationToStr(task.Duration) |
|
|
|
|
|
|
|
if task.StartTime == 0 && result.JobInfo.StartedAt > 0 { |
|
|
|
task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt) |
|
|
|
} |
|
|
|
if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 { |
|
|
|
task.EndTime = task.StartTime.Add(task.Duration) |
|
|
|
} |
|
|
|
task.CorrectCreateUnix() |
|
|
|
err = models.UpdateJob(task) |
|
|
|
if err != nil { |
|
|
|
log.Error("UpdateJob failed:" + err.Error()) |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
if len(task.Parameters) > 0 { |
|
|
|
var parameters models.Parameters |
|
|
|
err := json.Unmarshal([]byte(task.Parameters), ¶meters) |
|
|
|
if err != nil { |
|
|
|
log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err) |
|
|
|
ctx.ServerError("system error", err) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
if len(parameters.Parameter) > 0 { |
|
|
|
paramTemp := "" |
|
|
|
for _, Parameter := range parameters.Parameter { |
|
|
|
param := Parameter.Label + " = " + Parameter.Value + "; " |
|
|
|
paramTemp = paramTemp + param |
|
|
|
} |
|
|
|
task.Parameters = paramTemp[:len(paramTemp)-2] |
|
|
|
} else { |
|
|
|
task.Parameters = "" |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
taskList := make([]*models.Cloudbrain, 0) |
|
|
|
taskList = append(taskList, task) |
|
|
|
ctx.Data["version_list_task"] = taskList |
|
|
|
|
|
|
|
ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task) |
|
|
|
ctx.Data["displayJobName"] = task.DisplayJobName |
|
|
|
|
|
|
|
aiCenterInfo := strings.Split(task.AiCenter, "+") |
|
|
|
if len(aiCenterInfo) == 2 { |
|
|
|
ctx.Data["ai_center"] = aiCenterInfo[1] |
|
|
|
} |
|
|
|
|
|
|
|
ctx.HTML(http.StatusOK, tplGrampusTrainJobShow) |
|
|
|
} |
|
|
|
|
|
|
|
func GrampusGetLog(ctx *context.Context) { |
|
|
|
jobID := ctx.Params(":jobid") |
|
|
|
job, err := models.GetCloudbrainByJobID(jobID) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"]) |
|
|
|
ctx.ServerError(err.Error(), err) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
content, err := grampus.GetTrainJobLog(job.JobID) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"]) |
|
|
|
ctx.ServerError(err.Error(), err) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
ctx.JSON(http.StatusOK, map[string]interface{}{ |
|
|
|
"JobName": job.JobName, |
|
|
|
"Content": content, |
|
|
|
}) |
|
|
|
|
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bootFile, paramSrc, outputRemotePath, datasetName string) (string, error) { |
|
|
|
var command string |
|
|
|
|
|
|
|
command += grampus.CommandPrepareScript |
|
|
|
//download code & dataset |
|
|
|
if processorType == grampus.ProcessorTypeNPU { |
|
|
|
commandDownload := "./downloader_for_obs " + setting.Bucket + " " + codeRemotePath + " " + grampus.CodeArchiveName + " " + dataRemotePath + " " + datasetName + ";" |
|
|
|
command += commandDownload |
|
|
|
} else if processorType == grampus.ProcessorTypeGPU { |
|
|
|
commandDownload := "./downloader_for_minio " + setting.Grampus.Env + " " + codeRemotePath + " " + grampus.CodeArchiveName + " " + dataRemotePath + " " + datasetName + ";" |
|
|
|
command += commandDownload |
|
|
|
} |
|
|
|
|
|
|
|
//check download result |
|
|
|
commandCheckRes := "bash -c \"[[ $? -eq 0 ]] && exit 0 || exit -1;\";" |
|
|
|
command += commandCheckRes |
|
|
|
|
|
|
|
//unzip code & dataset |
|
|
|
toolUnzip := "unzip -q " |
|
|
|
if strings.HasSuffix(datasetName, ".tar.gz") { |
|
|
|
toolUnzip = "tar -zxvf " |
|
|
|
} |
|
|
|
commandUnzip := "cd /cache/code;unzip -q master.zip;echo \"start to unzip dataset\";cd /cache/dataset;" + toolUnzip + datasetName + ";" |
|
|
|
command += commandUnzip |
|
|
|
|
|
|
|
//check unzip result |
|
|
|
commandCheckRes = "bash -c \"[[ $? -eq 0 ]] && exit 0 || exit -1;\";" |
|
|
|
command += commandCheckRes |
|
|
|
|
|
|
|
command += "echo \"unzip finished;start to exec code;\";" |
|
|
|
|
|
|
|
//exec code |
|
|
|
var parameters models.Parameters |
|
|
|
var paramCode string |
|
|
|
param := make([]models.Parameter, 0) |
|
|
|
if len(paramSrc) != 0 { |
|
|
|
err := json.Unmarshal([]byte(paramSrc), ¶meters) |
|
|
|
if err != nil { |
|
|
|
log.Error("Failed to Unmarshal params: %s (%v)", paramSrc, err) |
|
|
|
return command, err |
|
|
|
} |
|
|
|
|
|
|
|
for _, parameter := range parameters.Parameter { |
|
|
|
param = append(param, models.Parameter{ |
|
|
|
Label: parameter.Label, |
|
|
|
Value: parameter.Value, |
|
|
|
}) |
|
|
|
paramCode += " --" + parameter.Label + "=" + parameter.Value |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
commandCode := "cd /cache/code/" + strings.ToLower(repoName) + ";python " + bootFile + paramCode + ";" |
|
|
|
command += commandCode |
|
|
|
|
|
|
|
//get exec result |
|
|
|
commandGetRes := "result=$?;" |
|
|
|
command += commandGetRes |
|
|
|
|
|
|
|
//upload models |
|
|
|
if processorType == grampus.ProcessorTypeNPU { |
|
|
|
commandUpload := "cd /cache/script_for_grampus/;./uploader_for_obs " + setting.Bucket + " " + outputRemotePath + " " + "/cache/output/;" |
|
|
|
command += commandUpload |
|
|
|
} else if processorType == grampus.ProcessorTypeGPU { |
|
|
|
commandUpload := "cd /cache/script_for_grampus/;./uploader_for_minio " + setting.Grampus.Env + " " + outputRemotePath + " " + "/cache/output/;" |
|
|
|
command += commandUpload |
|
|
|
} |
|
|
|
|
|
|
|
//check exec result |
|
|
|
commandCheckRes = "bash -c \"[[ $result -eq 0 ]] && exit 0 || exit -1;\"" |
|
|
|
command += commandCheckRes |
|
|
|
|
|
|
|
return command, nil |
|
|
|
} |
|
|
|
|
|
|
|
func downloadZipCode(ctx *context.Context, codePath, branchName string) error { |
|
|
|
archiveType := git.ZIP |
|
|
|
archivePath := codePath |
|
|
|
|
|
|
|
if !com.IsDir(archivePath) { |
|
|
|
if err := os.MkdirAll(archivePath, os.ModePerm); err != nil { |
|
|
|
log.Error("MkdirAll failed:" + err.Error()) |
|
|
|
return err |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
// Get corresponding commit. |
|
|
|
var ( |
|
|
|
commit *git.Commit |
|
|
|
err error |
|
|
|
) |
|
|
|
|
|
|
|
gitRepo := ctx.Repo.GitRepo |
|
|
|
if err != nil { |
|
|
|
log.Error("OpenRepository failed:" + err.Error()) |
|
|
|
return err |
|
|
|
} |
|
|
|
|
|
|
|
if gitRepo.IsBranchExist(branchName) { |
|
|
|
commit, err = gitRepo.GetBranchCommit(branchName) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetBranchCommit failed:" + err.Error()) |
|
|
|
return err |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
archivePath = path.Join(archivePath, grampus.CodeArchiveName) |
|
|
|
if !com.IsFile(archivePath) { |
|
|
|
if err := commit.CreateArchive(archivePath, git.CreateArchiveOpts{ |
|
|
|
Format: archiveType, |
|
|
|
Prefix: setting.Repository.PrefixArchiveFiles, |
|
|
|
}); err != nil { |
|
|
|
log.Error("CreateArchive failed:" + err.Error()) |
|
|
|
return err |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
return nil |
|
|
|
} |