From 00e8965d3864f9933b34ae73a2647ecbb0f91e25 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Thu, 19 May 2022 15:52:58 +0800 Subject: [PATCH 01/56] init --- routers/repo/grampus.go | 22 ++++++++++++++++++++++ routers/routes/routes.go | 12 ++++++++++++ 2 files changed, 34 insertions(+) create mode 100755 routers/repo/grampus.go diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go new file mode 100755 index 0000000000..3e8329dfbb --- /dev/null +++ b/routers/repo/grampus.go @@ -0,0 +1,22 @@ +package repo + +import ( + "code.gitea.io/gitea/modules/base" + "net/http" + + "code.gitea.io/gitea/modules/context" +) + +const ( + tplGrampusTrainJobNew base.TplName = "repo/grampus/trainjob/new" + tplGrampusTrainJobShow base.TplName = "repo/grampus/trainjob/show" +) + +func GrampusNew(ctx *context.Context) { + err := cloudBrainNewDataPrepare(ctx) + if err != nil { + ctx.ServerError("get new train-job info failed", err) + return + } + ctx.HTML(http.StatusOK, tplGrampusTrainJobNew) +} diff --git a/routers/routes/routes.go b/routers/routes/routes.go index 4c3f5f472c..a64eb0fae8 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -1083,6 +1083,18 @@ func RegisterRoutes(m *macaron.Macaron) { m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.CloudBrainCreate) }) }, context.RepoRef()) + m.Group("/grampus", func() { + m.Group("/train-job", func() { + m.Group("/:jobid", func() { + m.Get("", reqRepoCloudBrainReader, repo.CloudBrainTrainJobShow) + m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainStop) + m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainTrainJobDel) + m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel) + }) + m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusNew) + m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.GrampusCreate) + }) + }, context.RepoRef()) m.Group("/modelmanage", func() { m.Post("/create_model", reqRepoModelManageWriter, repo.SaveModel) m.Post("/create_new_model", repo.SaveNewNameModel) -- 2.34.1 From 126e07c14eb857e2bc1f3c6f3abedea0b0423a4e Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Fri, 20 May 2022 17:50:27 +0800 Subject: [PATCH 02/56] add config --- custom/conf/app.ini.sample | 6 + modules/grampus/grampus.go | 294 +++++ modules/grampus/resty.go | 1112 +++++++++++++++++ modules/setting/setting.go | 16 + routers/repo/grampus.go | 425 ++++++- routers/routes/routes.go | 26 +- templates/repo/grampus/trainjob/gpu/new.tmpl | 447 +++++++ templates/repo/grampus/trainjob/gpu/show.tmpl | 731 +++++++++++ templates/repo/grampus/trainjob/npu/new.tmpl | 475 +++++++ templates/repo/grampus/trainjob/npu/show.tmpl | 1008 +++++++++++++++ 10 files changed, 4527 insertions(+), 13 deletions(-) create mode 100755 modules/grampus/grampus.go create mode 100755 modules/grampus/resty.go create mode 100755 templates/repo/grampus/trainjob/gpu/new.tmpl create mode 100755 templates/repo/grampus/trainjob/gpu/show.tmpl create mode 100755 templates/repo/grampus/trainjob/npu/new.tmpl create mode 100755 templates/repo/grampus/trainjob/npu/show.tmpl diff --git a/custom/conf/app.ini.sample b/custom/conf/app.ini.sample index d294c88235..7a4298f6bd 100755 --- a/custom/conf/app.ini.sample +++ b/custom/conf/app.ini.sample @@ -1141,3 +1141,9 @@ growth_issue=0.2 growth_contributors=0.2 growth_commit=0.2 growth_comments=0.2 + + +[grampus] +USERNAME = +PASSWORD = +SERVER_HOST = diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go new file mode 100755 index 0000000000..b60afb5cce --- /dev/null +++ b/modules/grampus/grampus.go @@ -0,0 +1,294 @@ +package grampus + +import ( + "code.gitea.io/gitea/modules/timeutil" + "strconv" + + "code.gitea.io/gitea/models" + "code.gitea.io/gitea/modules/context" + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/notification" +) + +const ( + //notebook + storageTypeOBS = "obs" + autoStopDuration = 4 * 60 * 60 + autoStopDurationMs = 4 * 60 * 60 * 1000 + + DataSetMountPath = "/home/ma-user/work" + NotebookEnv = "Python3" + NotebookType = "Ascend" + FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" + + //train-job + // ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}" + // Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}" + // EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," + + // "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," + + // "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," + + // "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" + + // "]}" + // TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," + + // "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," + + // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," + + // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" + + // "]}" + CodePath = "/code/" + OutputPath = "/output/" + ResultPath = "/result/" + LogPath = "/log/" + JobPath = "/job/" + OrderDesc = "desc" //向下查询 + OrderAsc = "asc" //向上查询 + Lines = 500 + TrainUrl = "train_url" + DataUrl = "data_url" + ResultUrl = "result_url" + CkptUrl = "ckpt_url" + DeviceTarget = "device_target" + Ascend = "Ascend" + PerPage = 10 + IsLatestVersion = "1" + NotLatestVersion = "0" + VersionCount = 1 + + SortByCreateTime = "create_time" + ConfigTypeCustom = "custom" + TotalVersionCount = 1 +) + +var ( + poolInfos *models.PoolInfos + FlavorInfos *models.FlavorInfos + ImageInfos *models.ImageInfosModelArts +) + +type GenerateTrainJobReq struct { + JobName string + DisplayJobName string + Uuid string + Description string + CodeObsPath string + BootFile string + BootFileUrl string + DataUrl string + TrainUrl string + FlavorCode string + LogUrl string + PoolID string + WorkServerNumber int + EngineID int64 + Parameters []models.Parameter + CommitID string + IsLatestVersion string + Params string + BranchName string + PreVersionId int64 + PreVersionName string + FlavorName string + VersionCount int + EngineName string + TotalVersionCount int +} + +type GenerateInferenceJobReq struct { + JobName string + DisplayJobName string + Uuid string + Description string + CodeObsPath string + BootFile string + BootFileUrl string + DataUrl string + TrainUrl string + FlavorCode string + LogUrl string + PoolID string + WorkServerNumber int + EngineID int64 + Parameters []models.Parameter + CommitID string + Params string + BranchName string + FlavorName string + EngineName string + LabelName string + IsLatestVersion string + VersionCount int + TotalVersionCount int + ModelName string + ModelVersion string + CkptName string + ResultUrl string +} + +type VersionInfo struct { + Version []struct { + ID int `json:"id"` + Value string `json:"value"` + } `json:"version"` +} + +type Flavor struct { + Info []struct { + Code string `json:"code"` + Value string `json:"value"` + } `json:"flavor"` +} + +type Engine struct { + Info []struct { + ID int `json:"id"` + Value string `json:"value"` + } `json:"engine"` +} + +type ResourcePool struct { + Info []struct { + ID string `json:"id"` + Value string `json:"value"` + } `json:"resource_pool"` +} + +// type Parameter struct { +// Label string `json:"label"` +// Value string `json:"value"` +// } + +// type Parameters struct { +// Parameter []Parameter `json:"parameter"` +// } + +type Parameters struct { + Parameter []struct { + Label string `json:"label"` + Value string `json:"value"` + } `json:"parameter"` +} + +func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) { + createTime := timeutil.TimeStampNow() + jobResult, err := createTrainJob(models.CreateTrainJobParams{ + JobName: req.JobName, + Description: req.Description, + Config: models.Config{ + WorkServerNum: req.WorkServerNumber, + AppUrl: req.CodeObsPath, + BootFileUrl: req.BootFileUrl, + DataUrl: req.DataUrl, + EngineID: req.EngineID, + TrainUrl: req.TrainUrl, + LogUrl: req.LogUrl, + PoolID: req.PoolID, + CreateVersion: true, + Flavor: models.Flavor{ + Code: req.FlavorCode, + }, + Parameter: req.Parameters, + }, + }) + if err != nil { + log.Error("CreateJob failed: %v", err.Error()) + return err + } + + attach, err := models.GetAttachmentByUUID(req.Uuid) + if err != nil { + log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error()) + return err + } + jobId := strconv.FormatInt(jobResult.JobID, 10) + err = models.CreateCloudbrain(&models.Cloudbrain{ + Status: TransTrainJobStatus(jobResult.Status), + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: jobId, + JobName: req.JobName, + DisplayJobName: req.DisplayJobName, + JobType: string(models.JobTypeTrain), + Type: models.TypeCloudBrainTwo, + VersionID: jobResult.VersionID, + VersionName: jobResult.VersionName, + Uuid: req.Uuid, + DatasetName: attach.Name, + CommitID: req.CommitID, + IsLatestVersion: req.IsLatestVersion, + ComputeResource: models.NPUResource, + EngineID: req.EngineID, + TrainUrl: req.TrainUrl, + BranchName: req.BranchName, + Parameters: req.Params, + BootFile: req.BootFile, + DataUrl: req.DataUrl, + LogUrl: req.LogUrl, + FlavorCode: req.FlavorCode, + Description: req.Description, + WorkServerNumber: req.WorkServerNumber, + FlavorName: req.FlavorName, + EngineName: req.EngineName, + VersionCount: req.VersionCount, + TotalVersionCount: req.TotalVersionCount, + CreatedUnix: createTime, + UpdatedUnix: createTime, + }) + + if err != nil { + log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error()) + return err + } + notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask) + return nil +} + +func TransTrainJobStatus(status int) string { + switch status { + case 0: + return "UNKNOWN" + case 1: + return "INIT" + case 2: + return "IMAGE_CREATING" + case 3: + return "IMAGE_FAILED" + case 4: + return "SUBMIT_TRYING" + case 5: + return "SUBMIT_FAILED" + case 6: + return "DELETE_FAILED" + case 7: + return "WAITING" + case 8: + return "RUNNING" + case 9: + return "KILLING" + case 10: + return "COMPLETED" + case 11: + return "FAILED" + case 12: + return "KILLED" + case 13: + return "CANCELED" + case 14: + return "LOST" + case 15: + return "SCALING" + case 16: + return "SUBMIT_MODEL_FAILED" + case 17: + return "DEPLOY_SERVICE_FAILED" + case 18: + return "CHECK_INIT" + case 19: + return "CHECK_RUNNING" + case 20: + return "CHECK_RUNNING_COMPLETED" + case 21: + return "CHECK_FAILED" + + default: + return strconv.Itoa(status) + } +} diff --git a/modules/grampus/resty.go b/modules/grampus/resty.go new file mode 100755 index 0000000000..693ba71a1d --- /dev/null +++ b/modules/grampus/resty.go @@ -0,0 +1,1112 @@ +package grampus + +import ( + "crypto/tls" + "encoding/json" + "fmt" + "net/http" + "strconv" + + "code.gitea.io/gitea/models" + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/setting" + "github.com/go-resty/resty/v2" +) + +var ( + restyClient *resty.Client + HOST string + TOKEN string +) + +const ( + urlOpenApiV1 = "/openapi/v1/" + + urlGetToken = urlOpenApiV1 + "token" + urlNotebook = "/demanager/instances" + urlTrainJob = "/training-jobs" + urlResourceSpecs = "/job/resource-specs" + urlTrainJobConfig = "/training-job-configs" + errorCodeExceedLimit = "ModelArts.0118" + + urlNotebook2 = "" + + modelartsIllegalToken = "" +) + +type GetTokenParams struct { + UserName string `json:"user_name"` + Password string `json:"password"` +} + +type GetTokenResult struct { + Token string `json:"token"` + Expiration int64 `json:"expiration"` +} + +func getRestyClient() *resty.Client { + if restyClient == nil { + restyClient = resty.New() + restyClient.SetTLSClientConfig(&tls.Config{InsecureSkipVerify: true}) + } + return restyClient +} + +func checkSetting() { + if len(HOST) != 0 && len(TOKEN) != 0 && restyClient != nil { + return + } + + err := getToken() + if err != nil { + log.Error("getToken failed:%v", err) + } +} + +func getToken() error { + HOST = setting.Grampus.Host + + client := getRestyClient() + params := GetTokenParams{ + UserName: setting.Grampus.UserName, + Password: setting.Grampus.Password, + } + + var result GetTokenResult + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetBody(params). + SetResult(&result). + Post(HOST + urlGetToken) + if err != nil { + return fmt.Errorf("resty getToken: %v", err) + } + + if res.StatusCode() != http.StatusOK { + return fmt.Errorf("getToken failed:%s", res.String()) + } + + TOKEN = result.Token + log.Info(TOKEN) + + return nil +} + +func CreateJob(createJobParams models.CreateNotebookParams) (*models.CreateNotebookResult, error) { + checkSetting() + client := getRestyClient() + var result models.CreateNotebookResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetBody(createJobParams). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlNotebook) + + if err != nil { + return nil, fmt.Errorf("resty create notebook: %s", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) + if response.ErrorCode == errorCodeExceedLimit { + response.ErrorMsg = "所选规格使用数量已超过最大配额限制。" + } + return &result, fmt.Errorf("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + +func GetJob(jobID string) (*models.GetNotebookResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetNotebookResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID) + + if err != nil { + return nil, fmt.Errorf("resty GetJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + return &result, fmt.Errorf("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + +func GetNotebook2(jobID string) (*models.GetNotebook2Result, error) { + checkSetting() + client := getRestyClient() + var result models.GetNotebook2Result + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID) + + if err != nil { + return nil, fmt.Errorf("resty GetJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + if response.ErrorCode == modelartsIllegalToken && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + return &result, fmt.Errorf("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + +func ManageNotebook(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) { + checkSetting() + client := getRestyClient() + var result models.NotebookActionResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetBody(param). + SetAuthToken(TOKEN). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID + "/action") + + if err != nil { + return &result, fmt.Errorf("resty StopJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) + return &result, fmt.Errorf("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + +func ManageNotebook2(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) { + checkSetting() + client := getRestyClient() + var result models.NotebookActionResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID + "/" + param.Action + "?duration=" + strconv.Itoa(autoStopDurationMs)) + + if err != nil { + return &result, fmt.Errorf("resty ManageNotebook2: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + if response.ErrorCode == modelartsIllegalToken && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + return &result, fmt.Errorf("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + +func DelNotebook(jobID string) (*models.NotebookDelResult, error) { + checkSetting() + client := getRestyClient() + var result models.NotebookDelResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). + Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID) + + if err != nil { + return &result, fmt.Errorf("resty DelJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + return &result, fmt.Errorf("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + +func DelNotebook2(jobID string) (*models.NotebookDelResult, error) { + checkSetting() + client := getRestyClient() + var result models.NotebookDelResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). + Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID) + + if err != nil { + return &result, fmt.Errorf("resty DelJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + if response.ErrorCode == modelartsIllegalToken && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + return &result, fmt.Errorf("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + +func DelJob(jobID string) (*models.NotebookDelResult, error) { + checkSetting() + client := getRestyClient() + var result models.NotebookDelResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). + Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID) + + if err != nil { + return &result, fmt.Errorf("resty DelJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + return &result, fmt.Errorf("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + +func GetJobToken(jobID string) (*models.NotebookGetJobTokenResult, error) { + checkSetting() + client := getRestyClient() + var result models.NotebookGetJobTokenResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID + "/token") + + if err != nil { + return &result, fmt.Errorf("resty GetJobToken: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("GetJobToken failed(%s): %s", response.ErrorCode, response.ErrorMsg) + return &result, fmt.Errorf("GetJobToken failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} + +func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.CreateTrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.CreateTrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetBody(createJobParams). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob) + + if err != nil { + return nil, fmt.Errorf("resty create train-job: %s", err) + } + + req, _ := json.Marshal(createJobParams) + log.Info("%s", req) + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + BootFileErrorMsg := "Invalid OBS path '" + createJobParams.Config.BootFileUrl + "'." + DataSetErrorMsg := "Invalid OBS path '" + createJobParams.Config.DataUrl + "'." + if temp.ErrorMsg == BootFileErrorMsg { + log.Error("启动文件错误!createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("启动文件错误!") + } + if temp.ErrorMsg == DataSetErrorMsg { + log.Error("数据集错误!createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("数据集错误!") + } + return &result, fmt.Errorf("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + +func createTrainJobVersion(createJobVersionParams models.CreateTrainJobVersionParams, jobID string) (*models.CreateTrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.CreateTrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetBody(createJobVersionParams). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions") + + if err != nil { + return nil, fmt.Errorf("resty create train-job version: %s", err) + } + + req, _ := json.Marshal(createJobVersionParams) + log.Info("%s", req) + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + BootFileErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.BootFileUrl + "'." + DataSetErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.DataUrl + "'." + if temp.ErrorMsg == BootFileErrorMsg { + log.Error("启动文件错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("启动文件错误!") + } + if temp.ErrorMsg == DataSetErrorMsg { + log.Error("数据集错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("数据集错误!") + } + return &result, fmt.Errorf("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + +func GetResourceSpecs() (*models.GetResourceSpecsResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetResourceSpecsResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + "/v1/" + setting.ProjectID + urlResourceSpecs) + + if err != nil { + return nil, fmt.Errorf("resty GetResourceSpecs: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + +func CreateTrainJobConfig(req models.CreateConfigParams) (*models.CreateTrainJobConfigResult, error) { + checkSetting() + client := getRestyClient() + var result models.CreateTrainJobConfigResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetBody(req). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig) + + if err != nil { + return nil, fmt.Errorf("resty CreateTrainJobConfig: %s", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + //temp, _ := json.Marshal(req) + //log.Info("%s", temp) + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("CreateTrainJobConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("CreateTrainJobConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + +func GetConfigList(perPage, page int, sortBy, order, searchContent, configType string) (*models.GetConfigListResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetConfigListResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetQueryParams(map[string]string{ + "per_page": strconv.Itoa(perPage), + "page": strconv.Itoa(page), + "sortBy": sortBy, + "order": order, + "search_content": searchContent, + "config_type": configType, + }). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig) + + if err != nil { + return nil, fmt.Errorf("resty GetConfigList: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("GetConfigList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("获取参数配置列表失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("GetConfigList failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("获取参数配置列表失败(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + +func GetParaConfig(configName, configType string) (models.GetConfigResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetConfigResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetQueryParams(map[string]string{ + "config_type": configType, + }). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig + "/" + configName) + + if err != nil { + return result, fmt.Errorf("resty GetParaConfig: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("GetParaConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return result, fmt.Errorf("获取参数配置详情失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("GetParaConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return result, fmt.Errorf("获取参数配置详情失败(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return result, nil +} + +func GetTrainJob(jobID, versionID string) (*models.GetTrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetTrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID) + + if err != nil { + return nil, fmt.Errorf("resty GetTrainJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("GetTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("获取作业详情失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("GetTrainJob(%s) failed", jobID) + return &result, fmt.Errorf("获取作业详情失败") + } + + return &result, nil +} + +func GetTrainJobLog(jobID, versionID, baseLine, logFile, order string, lines int) (*models.GetTrainJobLogResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetTrainJobLogResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetQueryParams(map[string]string{ + "base_line": baseLine, + "lines": strconv.Itoa(lines), + "log_file": logFile, + "order": order, + }). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/aom-log") + + if err != nil { + return nil, fmt.Errorf("resty GetTrainJobLog: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("GetTrainJobLog failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("获取作业日志失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("GetTrainJobLog(%s) failed", jobID) + return &result, fmt.Errorf("获取作业日志失败:%s", result.ErrorMsg) + } + + return &result, nil +} + +func GetTrainJobLogFileNames(jobID, versionID string) (*models.GetTrainJobLogFileNamesResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetTrainJobLogFileNamesResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/log/file-names") + + if err != nil { + return nil, fmt.Errorf("resty GetTrainJobLogFileNames: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("GetTrainJobLogFileNames failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("GetTrainJobLogFileNames failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("GetTrainJobLogFileNames(%s) failed", jobID) + return &result, fmt.Errorf("获取作业日志文件失败:%s", result.ErrorMsg) + } + + return &result, nil +} + +func DelTrainJob(jobID string) (*models.TrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.TrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetAuthToken(TOKEN). + SetResult(&result). + Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID) + + if err != nil { + return &result, fmt.Errorf("resty DelTrainJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("DelTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("删除训练作业失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("DelTrainJob(%s) failed", jobID) + return &result, fmt.Errorf("删除训练作业失败:%s", result.ErrorMsg) + } + + return &result, nil +} + +func StopTrainJob(jobID, versionID string) (*models.TrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.TrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetAuthToken(TOKEN). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/stop") + + if err != nil { + return &result, fmt.Errorf("resty StopTrainJob: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("StopTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("停止训练作业失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("StopTrainJob(%s) failed", jobID) + return &result, fmt.Errorf("停止训练作业失败:%s", result.ErrorMsg) + } + + return &result, nil +} + +func DelTrainJobVersion(jobID string, versionID string) (*models.TrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.TrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetAuthToken(TOKEN). + SetResult(&result). + Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID) + + if err != nil { + return &result, fmt.Errorf("resty DelTrainJobVersion: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("DelTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("删除训练作业版本失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("DelTrainJob(%s) failed", jobID) + return &result, fmt.Errorf("删除训练作业版本失败:%s", result.ErrorMsg) + } + + return &result, nil +} + +func createInferenceJob(createJobParams models.CreateInferenceJobParams) (*models.CreateTrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.CreateTrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetBody(createJobParams). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob) + + if err != nil { + return nil, fmt.Errorf("resty create inference-job: %s", err) + } + + req, _ := json.Marshal(createJobParams) + log.Info("%s", req) + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + BootFileErrorMsg := "Invalid OBS path '" + createJobParams.InfConfig.BootFileUrl + "'." + DataSetErrorMsg := "Invalid OBS path '" + createJobParams.InfConfig.DataUrl + "'." + if temp.ErrorMsg == BootFileErrorMsg { + log.Error("启动文件错误!createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("启动文件错误!") + } + if temp.ErrorMsg == DataSetErrorMsg { + log.Error("数据集错误!createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("数据集错误!") + } + return &result, fmt.Errorf("createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("createInferenceJob failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("createInferenceJob failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + +func createNotebook2(createJobParams models.CreateNotebook2Params) (*models.CreateNotebookResult, error) { + checkSetting() + client := getRestyClient() + var result models.CreateNotebookResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetBody(createJobParams). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2) + + if err != nil { + return nil, fmt.Errorf("resty create notebook2: %s", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + var response models.NotebookResult + err = json.Unmarshal(res.Body(), &response) + if err != nil { + log.Error("json.Unmarshal failed: %s", err.Error()) + return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) + } + + if len(response.ErrorCode) != 0 { + log.Error("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + if response.ErrorCode == errorCodeExceedLimit { + response.ErrorMsg = "所选规格使用数量已超过最大配额限制。" + } + if response.ErrorCode == modelartsIllegalToken && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + return &result, fmt.Errorf("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) + } + + return &result, nil +} diff --git a/modules/setting/setting.go b/modules/setting/setting.go index 5c87b68c5a..945a7c6f87 100755 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -528,6 +528,13 @@ var ( FlavorInfos string TrainJobFLAVORINFOS string + //grampus config + Grampus = struct { + Host string + UserName string + Password string + }{} + //elk config ElkUrl string ElkUser string @@ -1382,6 +1389,15 @@ func NewContext() { Course.OrgName = sec.Key("org_name").MustString("") Course.TeamName = sec.Key("team_name").MustString("") + GetGrampusConfig() +} + +func GetGrampusConfig() { + sec := Cfg.Section("grampus") + + Grampus.Host = sec.Key("SERVER_HOST").MustString("") + Grampus.UserName = sec.Key("USERNAME").MustString("") + Grampus.Password = sec.Key("PASSWORD").MustString("") } func SetRadarMapConfig() { diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index 3e8329dfbb..8f31896616 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -1,22 +1,435 @@ package repo import ( - "code.gitea.io/gitea/modules/base" + "code.gitea.io/gitea/modules/auth" + "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/modelarts" + "code.gitea.io/gitea/modules/util" + "encoding/json" + "io/ioutil" "net/http" + "os" + "path" + "strconv" + "strings" + "time" + "code.gitea.io/gitea/models" + "code.gitea.io/gitea/modules/base" + "code.gitea.io/gitea/modules/cloudbrain" "code.gitea.io/gitea/modules/context" + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/setting" ) const ( - tplGrampusTrainJobNew base.TplName = "repo/grampus/trainjob/new" - tplGrampusTrainJobShow base.TplName = "repo/grampus/trainjob/show" + //GPU + tplGrampusTrainJobGPUNew base.TplName = "repo/grampus/trainjob/gpu/new" + tplGrampusTrainJobGPUShow base.TplName = "repo/grampus/trainjob/gpu/show" + + //NPU + tplGrampusTrainJobNPUNew base.TplName = "repo/grampus/trainjob/npu/new" + tplGrampusTrainJobNPUShow base.TplName = "repo/grampus/trainjob/npu/show" ) -func GrampusNew(ctx *context.Context) { - err := cloudBrainNewDataPrepare(ctx) +func GrampusTrainJobGPUNew(ctx *context.Context) { + err := grampusGpuNewDataPrepare(ctx) + if err != nil { + ctx.ServerError("get new train-job info failed", err) + return + } + ctx.HTML(http.StatusOK, tplGrampusTrainJobGPUNew) +} + +func grampusGpuNewDataPrepare(ctx *context.Context) error { + ctx.Data["PageIsCloudBrain"] = true + t := time.Now() + var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:] + ctx.Data["display_job_name"] = displayJobName + + //get valid images + result, err := cloudbrain.GetImages() + if err != nil { + ctx.Data["error"] = err.Error() + log.Error("cloudbrain.GetImages failed:", err.Error(), ctx.Data["MsgID"]) + } + + for i, payload := range result.Payload.ImageInfo { + if strings.HasPrefix(result.Payload.ImageInfo[i].Place, "192.168") { + result.Payload.ImageInfo[i].PlaceView = payload.Place[strings.Index(payload.Place, "/"):len(payload.Place)] + } else { + result.Payload.ImageInfo[i].PlaceView = payload.Place + } + } + + ctx.Data["images"] = result.Payload.ImageInfo + + resultPublic, err := cloudbrain.GetPublicImages() + if err != nil { + ctx.Data["error"] = err.Error() + log.Error("cloudbrain.GetPublicImages failed:", err.Error(), ctx.Data["MsgID"]) + } + + for i, payload := range resultPublic.Payload.ImageInfo { + if strings.HasPrefix(resultPublic.Payload.ImageInfo[i].Place, "192.168") { + resultPublic.Payload.ImageInfo[i].PlaceView = payload.Place[strings.Index(payload.Place, "/"):len(payload.Place)] + } else { + resultPublic.Payload.ImageInfo[i].PlaceView = payload.Place + } + } + + ctx.Data["public_images"] = resultPublic.Payload.ImageInfo + + //get valid dataset + attachs, err := models.GetAllUserAttachments(ctx.User.ID) + if err != nil { + log.Error("GetAllUserAttachments failed: %v", err, ctx.Data["MsgID"]) + return err + } + + ctx.Data["attachments"] = attachs + ctx.Data["command"] = cloudbrain.Command + ctx.Data["code_path"] = cloudbrain.CodeMountPath + ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath + ctx.Data["model_path"] = cloudbrain.ModelMountPath + ctx.Data["benchmark_path"] = cloudbrain.BenchMarkMountPath + ctx.Data["is_benchmark_enabled"] = setting.IsBenchmarkEnabled + + //get valid resource specs + if categories == nil { + json.Unmarshal([]byte(setting.BenchmarkCategory), &categories) + } + ctx.Data["benchmark_categories"] = categories.Category + + ctx.Data["benchmark_types"] = GetBenchmarkTypes(ctx).BenchmarkType + + if gpuInfos == nil { + json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos) + } + ctx.Data["gpu_types"] = gpuInfos.GpuInfo + + if trainGpuInfos == nil { + json.Unmarshal([]byte(setting.TrainGpuTypes), &trainGpuInfos) + } + ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo + + if benchmarkGpuInfos == nil { + json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) + } + ctx.Data["benchmark_gpu_types"] = benchmarkGpuInfos.GpuInfo + + if benchmarkResourceSpecs == nil { + json.Unmarshal([]byte(setting.BenchmarkResourceSpecs), &benchmarkResourceSpecs) + } + ctx.Data["benchmark_resource_specs"] = benchmarkResourceSpecs.ResourceSpec + + if cloudbrain.ResourceSpecs == nil { + json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) + } + ctx.Data["resource_specs"] = cloudbrain.ResourceSpecs.ResourceSpec + + if cloudbrain.TrainResourceSpecs == nil { + json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs) + } + ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec + ctx.Data["params"] = "" + ctx.Data["branchName"] = ctx.Repo.BranchName + + ctx.Data["snn4imagenet_path"] = cloudbrain.Snn4imagenetMountPath + ctx.Data["is_snn4imagenet_enabled"] = setting.IsSnn4imagenetEnabled + + ctx.Data["brainscore_path"] = cloudbrain.BrainScoreMountPath + ctx.Data["is_brainscore_enabled"] = setting.IsBrainScoreEnabled + + ctx.Data["cloudbraintype"] = models.TypeCloudBrainOne + + ctx.Data["benchmarkMode"] = ctx.Query("benchmarkMode") + + return nil +} + +func GrampusTrainJobNPUNew(ctx *context.Context) { + err := trainJobNpuNewDataPrepare(ctx) if err != nil { ctx.ServerError("get new train-job info failed", err) return } - ctx.HTML(http.StatusOK, tplGrampusTrainJobNew) + ctx.HTML(200, tplGrampusTrainJobNPUNew) +} + +func trainJobNpuNewDataPrepare(ctx *context.Context) error { + ctx.Data["PageIsCloudBrain"] = true + + t := time.Now() + var displayJobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:] + ctx.Data["display_job_name"] = displayJobName + + //get valid dataset + attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID) + if err != nil { + ctx.ServerError("GetAllUserAttachments failed:", err) + return err + } + ctx.Data["attachments"] = attachs + + //get valid resource specs + var resourcePools modelarts.ResourcePool + if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return err + } + ctx.Data["resource_pools"] = resourcePools.Info + + var engines modelarts.Engine + if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return err + } + ctx.Data["engines"] = engines.Info + + var versionInfos modelarts.VersionInfo + if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return err + } + ctx.Data["engine_versions"] = versionInfos.Version + + var flavorInfos modelarts.Flavor + if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return err + } + ctx.Data["flavor_infos"] = flavorInfos.Info + + ctx.Data["params"] = "" + ctx.Data["branchName"] = ctx.Repo.BranchName + + configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom) + if err != nil { + ctx.ServerError("getConfigList failed:", err) + return err + } + ctx.Data["config_list"] = configList.ParaConfigs + ctx.Data["cloudbraintype"] = models.TypeCloudBrainTwo + + return nil +} + +func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) { + VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount) + displayJobName := form.DisplayJobName + jobName := util.ConvertDisplayJobNameToJobName(displayJobName) + uuid := form.Attachment + description := form.Description + workServerNumber := form.WorkServerNumber + engineID := form.EngineID + bootFile := form.BootFile + flavorCode := form.Flavor + params := form.Params + poolID := form.PoolID + isSaveParam := form.IsSaveParam + repo := ctx.Repo.Repository + codeLocalPath := setting.JobPath + jobName + modelarts.CodePath + codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath + outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/" + logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/" + dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/" + branch_name := form.BranchName + isLatestVersion := modelarts.IsLatestVersion + FlavorName := form.FlavorName + VersionCount := modelarts.VersionCount + EngineName := form.EngineName + + count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) + if err != nil { + log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form) + return + } else { + if count >= 1 { + log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobNew, &form) + return + } + } + + if err := paramCheckCreateTrainJob(form); err != nil { + log.Error("paramCheckCreateTrainJob failed:(%v)", err) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) + return + } + //Determine whether the task name of the task in the project is duplicated + tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName) + if err == nil { + if len(tasks) != 0 { + log.Error("the job name did already exist", ctx.Data["MsgID"]) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("the job name did already exist", tplModelArtsTrainJobNew, &form) + return + } + } else { + if !models.IsErrJobNotExist(err) { + log.Error("system error, %v", err, ctx.Data["MsgID"]) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form) + return + } + } + + //todo: del the codeLocalPath + _, err = ioutil.ReadDir(codeLocalPath) + if err == nil { + os.RemoveAll(codeLocalPath) + } + + gitRepo, _ := git.OpenRepository(repo.RepoPath()) + commitID, _ := gitRepo.GetBranchCommitID(branch_name) + + if err := downloadCode(repo, codeLocalPath, branch_name); err != nil { + log.Error("downloadCode failed, server timed out: %s (%v)", repo.FullName(), err) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("Create task failed, server timed out", tplModelArtsTrainJobNew, &form) + return + } + + //todo: upload code (send to file_server todo this work?) + if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil { + log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form) + return + } + + if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil { + log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form) + return + } + + // parentDir := VersionOutputPath + "/" + if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { + // if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil { + log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form) + return + } + + var parameters models.Parameters + param := make([]models.Parameter, 0) + existDeviceTarget := false + if len(params) != 0 { + err := json.Unmarshal([]byte(params), ¶meters) + if err != nil { + log.Error("Failed to Unmarshal params: %s (%v)", params, err) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobNew, &form) + return + } + + for _, parameter := range parameters.Parameter { + if parameter.Label == modelarts.DeviceTarget { + existDeviceTarget = true + } + if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl { + param = append(param, models.Parameter{ + Label: parameter.Label, + Value: parameter.Value, + }) + } + } + } + if !existDeviceTarget { + param = append(param, models.Parameter{ + Label: modelarts.DeviceTarget, + Value: modelarts.Ascend, + }) + } + + //save param config + if isSaveParam == "on" { + saveparams := append(param, models.Parameter{ + Label: modelarts.TrainUrl, + Value: outputObsPath, + }, models.Parameter{ + Label: modelarts.DataUrl, + Value: dataPath, + }) + if form.ParameterTemplateName == "" { + log.Error("ParameterTemplateName is empty") + trainJobNewDataPrepare(ctx) + ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form) + return + } + + _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{ + ConfigName: form.ParameterTemplateName, + Description: form.PrameterDescription, + DataUrl: dataPath, + AppUrl: codeObsPath, + BootFileUrl: codeObsPath + bootFile, + TrainUrl: outputObsPath, + Flavor: models.Flavor{ + Code: flavorCode, + }, + WorkServerNum: workServerNumber, + EngineID: int64(engineID), + LogUrl: logObsPath, + PoolID: poolID, + Parameter: saveparams, + }) + + if err != nil { + log.Error("Failed to CreateTrainJobConfig: %v", err) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form) + return + } + } + + req := &modelarts.GenerateTrainJobReq{ + JobName: jobName, + DisplayJobName: displayJobName, + DataUrl: dataPath, + Description: description, + CodeObsPath: codeObsPath, + BootFileUrl: codeObsPath + bootFile, + BootFile: bootFile, + TrainUrl: outputObsPath, + FlavorCode: flavorCode, + WorkServerNumber: workServerNumber, + EngineID: int64(engineID), + LogUrl: logObsPath, + PoolID: poolID, + Uuid: uuid, + Parameters: param, + CommitID: commitID, + IsLatestVersion: isLatestVersion, + BranchName: branch_name, + Params: form.Params, + FlavorName: FlavorName, + EngineName: EngineName, + VersionCount: VersionCount, + TotalVersionCount: modelarts.TotalVersionCount, + } + + //将params转换Parameters.Parameter,出错时返回给前端 + var Parameters modelarts.Parameters + if err := json.Unmarshal([]byte(params), &Parameters); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return + } + + err = modelarts.GenerateTrainJob(ctx, req) + if err != nil { + log.Error("GenerateTrainJob failed:%v", err.Error()) + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) + return + } + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") } diff --git a/routers/routes/routes.go b/routers/routes/routes.go index a64eb0fae8..ab3e2dd559 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -1085,14 +1085,26 @@ func RegisterRoutes(m *macaron.Macaron) { }, context.RepoRef()) m.Group("/grampus", func() { m.Group("/train-job", func() { - m.Group("/:jobid", func() { - m.Get("", reqRepoCloudBrainReader, repo.CloudBrainTrainJobShow) - m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainStop) - m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainTrainJobDel) - m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel) + m.Group("/gpu", func() { + m.Group("/:jobid", func() { + m.Get("", reqRepoCloudBrainReader, repo.CloudBrainTrainJobShow) + m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainStop) + m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainTrainJobDel) + m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel) + }) + m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusTrainJobGPUNew) + //m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.GrampusTrainJobCreate) + }) + m.Group("/npu", func() { + m.Group("/:jobid", func() { + m.Get("", reqRepoCloudBrainReader, repo.CloudBrainTrainJobShow) + m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainStop) + m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainTrainJobDel) + m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel) + }) + m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusTrainJobNPUNew) + //m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.GrampusTrainJobCreate) }) - m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusNew) - m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.GrampusCreate) }) }, context.RepoRef()) m.Group("/modelmanage", func() { diff --git a/templates/repo/grampus/trainjob/gpu/new.tmpl b/templates/repo/grampus/trainjob/gpu/new.tmpl new file mode 100755 index 0000000000..7c42eba756 --- /dev/null +++ b/templates/repo/grampus/trainjob/gpu/new.tmpl @@ -0,0 +1,447 @@ +{{template "base/head" .}} + + +
+
+
+
+
+
+
+
+
+
+ {{template "repo/header" .}} +
+ {{template "base/alert" .}} +

+ {{.i18n.Tr "repo.modelarts.train_job.new"}} +

+
+ +
+ {{.CsrfTokenHtml}} + + + +

{{.i18n.Tr "repo.modelarts.train_job.basic_info"}}:

+
+ + +
+
+ + + {{.i18n.Tr "cloudbrain.job_name_rule"}} +
+ +
+ + +
+
+ +

{{.i18n.Tr "repo.modelarts.train_job.parameter_setting"}}:

+ + +
+ + +
+ + + +
+ + +
+ + + +
+ +
+ +
+ + {{if .bootFile}} + + {{else}} + + {{end}} + + + + 查看样例 +
+ + + {{template "custom/select_dataset_train" .}} + 训练脚本存储在/code中,数据集存储在/dataset中,训练输出请存储在/model中以供后续下载。 +
+ + {{.i18n.Tr "repo.modelarts.train_job.add_run_parameter"}} + +
+ {{if .params}} + {{if ne 0 (len .params)}} + {{range $k ,$v := .params}} +
+
+ +
+
+ +
+ + + + +
+ {{end}} + {{end}} + {{end}} +
+
+ +
+ + +
+ +
+ + {{.i18n.Tr "repo.cloudbrain.cancel"}} +
+ + + +
+
+
+
+{{template "base/footer" .}} + + \ No newline at end of file diff --git a/templates/repo/grampus/trainjob/gpu/show.tmpl b/templates/repo/grampus/trainjob/gpu/show.tmpl new file mode 100755 index 0000000000..f1087abcfb --- /dev/null +++ b/templates/repo/grampus/trainjob/gpu/show.tmpl @@ -0,0 +1,731 @@ +{{template "base/head" .}} + +
+
+
+
+
+
+
+
+
+
+ {{template "repo/header" .}} +
+

+ +

+ {{range $k ,$v := .version_list_task}} +
+ +
+
+
+ + + +
+ {{TimeSinceUnix1 .CreatedUnix}} + + {{$.i18n.Tr "repo.modelarts.status"}}: + {{.Status}} + + {{$.i18n.Tr "repo.modelarts.train_job.dura_time"}}: + {{$.duration}} + +
+
+
+
+
+
+
+
+ +
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ {{$.i18n.Tr "repo.cloudbrain_task"}} + +
+ {{.DisplayJobName}} +
+
+ {{$.i18n.Tr "repo.modelarts.status"}} + +
+ {{.Status}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.start_time"}} + +
+ {{TimeSinceUnix1 .CreatedUnix}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.dura_time"}} + +
+ {{$.duration}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.resource_type"}} + +
+ {{$.resource_type}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.standard"}} + +
+ {{$.i18n.Tr "cloudbrain.gpu_num"}}:{{$.GpuNum}},{{$.i18n.Tr "cloudbrain.cpu_num"}}:{{$.CpuNum}},{{$.i18n.Tr "cloudbrain.memory"}}(MB):{{$.MemMiB}},{{$.i18n.Tr "cloudbrain.shared_memory"}}(MB):{{$.ShareMemMiB}} +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ {{$.i18n.Tr "cloudbrain.mirror"}} + +
+ {{.Image}} +
+
+ {{$.i18n.Tr "repo.modelarts.code_version"}} + +
+ {{.BranchName}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.start_file"}} + +
+ {{.BootFile}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.train_dataset"}} + +
+ {{.DatasetName}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.run_parameter"}} + +
+ {{.Parameters}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.description"}} + +
+ {{.Description}} +
+
+
+
+
+ +
+
+ +
+
+ +
+ + + + + +
+ +
+ +
+ +
+
+ +
+ + +

+                            
+ +
+ +
+ +
+ + + +
+ +
+
+ +
+
+
+ {{end}} {{template "base/paginate" .}} +
+ +
+ +
+ + +
+{{template "base/footer" .}} + + \ No newline at end of file diff --git a/templates/repo/grampus/trainjob/npu/new.tmpl b/templates/repo/grampus/trainjob/npu/new.tmpl new file mode 100755 index 0000000000..6f5f5455f0 --- /dev/null +++ b/templates/repo/grampus/trainjob/npu/new.tmpl @@ -0,0 +1,475 @@ +{{template "base/head" .}} + + +
+
+
+
+
+
+
+
+
+
+ {{template "repo/header" .}} +
+ {{template "base/alert" .}} +

+ {{.i18n.Tr "repo.modelarts.train_job.new"}} +

+
+ +
+ {{.CsrfTokenHtml}} + + + +

{{.i18n.Tr "repo.modelarts.train_job.basic_info"}}:

+
+ + +
+
+ + + {{.i18n.Tr "cloudbrain.job_name_rule"}} +
+ +
+ + +
+
+ +

{{.i18n.Tr "repo.modelarts.train_job.parameter_setting"}}:

+ + +
+ + +
+ + + +
+ +
+ +
+ +
+ + +
+ +
+ +
+ + {{if .bootFile}} + + {{else}} + + {{end}} + + + + {{.i18n.Tr "cloudbrain.view_sample"}} +
+ + {{template "custom/select_dataset_train" .}} + {{.i18n.Tr "cloudbrain.dataset_path_rule"}} +
+ + {{.i18n.Tr "repo.modelarts.train_job.add_run_parameter"}} + +
+ {{if ne 0 (len .params)}} + {{range $k ,$v := .params}} +
+
+ +
+
+ +
+ + + + +
+ {{end}} + {{end}} +
+
+ + + + + + +
+ + +
+
+ + +
+ + +
+ +
+ +
+
+ +
+ + {{.i18n.Tr "repo.cloudbrain.cancel"}} +
+ + + +
+
+
+
+{{template "base/footer" .}} + + diff --git a/templates/repo/grampus/trainjob/npu/show.tmpl b/templates/repo/grampus/trainjob/npu/show.tmpl new file mode 100755 index 0000000000..8f168fcf9f --- /dev/null +++ b/templates/repo/grampus/trainjob/npu/show.tmpl @@ -0,0 +1,1008 @@ +{{template "base/head" .}} + +
+
+
+
+
+
+
+
+
+
+ {{template "repo/header" .}} +
+

+ +

+ {{range $k ,$v := .version_list_task}} +
+
+
+
+ + + +
+ {{$.CsrfTokenHtml}} + {{if and (.CanModify) (eq .Status "COMPLETED") ($.Permission.CanWrite $.UnitTypeModelManage) }} + {{$.i18n.Tr "repo.modelarts.create_model"}} + {{else}} + {{$.i18n.Tr "repo.modelarts.create_model"}} + {{end}} + + {{if .CanModify}} + {{$.i18n.Tr "repo.modelarts.modify"}} + {{else}} + {{$.i18n.Tr "repo.modelarts.modify"}} + {{end}} + + {{if .CanDel}} + {{$.i18n.Tr "repo.stop"}} + {{else}} + {{$.i18n.Tr "repo.stop"}} + {{end}} + + + {{if .CanDel}} + {{$.i18n.Tr "repo.delete"}} + {{else}} + {{$.i18n.Tr "repo.delete"}} + {{end}} +
+
+ + + {{if not (eq .Cloudbrain.StartTime 0)}} + {{TimeSinceUnix1 .Cloudbrain.StartTime}} + {{else}} + {{TimeSinceUnix1 .Cloudbrain.CreatedUnix}} + {{end}} + + {{$.i18n.Tr "repo.modelarts.current_version"}}:{{.VersionName}} + + {{$.i18n.Tr "repo.modelarts.parent_version"}}:{{.PreVersionName}} + {{$.i18n.Tr "repo.modelarts.status"}}: + {{.Status}} + + {{$.i18n.Tr "repo.modelarts.train_job.dura_time"}}: + {{.TrainJobDuration}} + + +
+
+
+
+
+
+
+
+ +
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ {{$.i18n.Tr "repo.cloudbrain_task"}} + +
+ {{.DisplayJobName}} +
+
+ {{$.i18n.Tr "repo.modelarts.status"}} + +
+ {{.Status}} +
+
+ {{$.i18n.Tr "repo.modelarts.run_version"}} + +
+ {{.VersionName}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.start_time"}} + +
+ + {{if not (eq .Cloudbrain.StartTime 0)}} + {{TimeSinceUnix1 .Cloudbrain.StartTime}} + {{else}} + {{TimeSinceUnix1 .Cloudbrain.CreatedUnix}} + {{end}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.dura_time"}} + +
+ {{.TrainJobDuration}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.standard"}} + +
+ {{.FlavorName}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.compute_node"}} + +
+ {{.WorkServerNumber}} +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ {{$.i18n.Tr "repo.modelarts.train_job.AI_driver"}} + +
+ {{.EngineName}} +
+
+ {{$.i18n.Tr "repo.modelarts.code_version"}} + +
+ {{.BranchName}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.start_file"}} + +
+ {{.BootFile}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.train_dataset"}} + +
+ {{.DatasetName}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.run_parameter"}} + +
+ {{.Parameters}} +
+
+ {{$.i18n.Tr "repo.modelarts.train_job.description"}} + +
+ {{.Cloudbrain.Description}} +
+
+
+
+
+ +
+
+
+
+ + + + + + + +
+ + +

+                            
+ +
+ +
+
+ + + +
+ +
+
+ +
+
+
+ {{end}} {{template "base/paginate" .}} +
+ +
+ +
+ +
+ +
+
+{{template "base/footer" .}} + + \ No newline at end of file -- 2.34.1 From bdc509391ff11aa011717a30e29029e672487a76 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Mon, 23 May 2022 17:57:13 +0800 Subject: [PATCH 03/56] create job --- models/cloudbrain.go | 59 ++++ modules/auth/grampus.go | 23 ++ modules/cloudbrain/cloudbrain.go | 2 - modules/grampus/grampus.go | 246 +++----------- modules/grampus/resty.go | 533 +------------------------------ routers/repo/grampus.go | 185 +++++------ routers/repo/modelarts.go | 19 +- routers/routes/routes.go | 2 +- 8 files changed, 234 insertions(+), 835 deletions(-) create mode 100755 modules/auth/grampus.go diff --git a/models/cloudbrain.go b/models/cloudbrain.go index e28ba3ea5c..f775626ada 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -24,6 +24,7 @@ type ModelArtsJobStatus string const ( TypeCloudBrainOne int = iota TypeCloudBrainTwo + TypeCloudBrainGrampus TypeCloudBrainAll = -1 ) @@ -98,6 +99,14 @@ const ( ModelArtsTrainJobCheckFailed ModelArtsJobStatus = "CHECK_FAILED" //审核作业失败 DURATION_STR_ZERO = "00:00:00" + + //grampus + GrampusStatusPending = "pending" + GrampusStatusRunning = "running" + GrampusStatusFailed = "failed" + GrampusStatusSucceeded = "succeeded" + GrampusStatusStopped = "stopped" + GrampusStatusUnknown = "unknown" ) type Cloudbrain struct { @@ -328,6 +337,7 @@ type CloudbrainsOptions struct { JobTypeNot bool NeedRepoInfo bool RepoIDList []int64 + ComputeResource string } type TaskPod struct { @@ -1150,6 +1160,44 @@ type LogFile struct { Name string } +//Grampus +type GrampusResult struct { + ErrorCode int `json:"errorCode"` + ErrorMsg string `json:"errorMsg"` +} + +type GrampusJobInfo struct { + StartedAt int64 `json:"startedAt"` + RunSec int64 `json:"runSec"` + CompletedAt int64 `json:"completedAt"` + CreatedAt int64 `json:"createdAt"` + UpdatedAt int64 `json:"updatedAt"` + Desc string `json:"desc"` + JobID string `json:"id"` + Name string `json:"name"` + Status string `json:"status"` + UserID string `json:"userId"` + Tasks []GrampusTasks `json:"tasks"` +} + +type CreateGrampusJobResponse struct { + GrampusResult + JobInfo GrampusJobInfo `json:"otJob"` +} + +type GrampusTasks struct { + Command string `json:"command"` + Name string `json:"name"` + ImageId string `json:"imageId"` + ResourceSpecId string `json:"resourceSpecId"` + ImageUrl string `json:"imageUrl"` +} + +type CreateGrampusJobRequest struct { + Name string `json:"name"` + Tasks []GrampusTasks `json:"tasks"` +} + func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { sess := x.NewSession() defer sess.Close() @@ -1179,6 +1227,12 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { ) } + if len(opts.ComputeResource) >= 0 { + cond = cond.And( + builder.Eq{"cloudbrain.compute_resource": opts.ComputeResource}, + ) + } + if len(opts.JobTypes) > 0 { if opts.JobTypeNot { cond = cond.And( @@ -1589,6 +1643,11 @@ func GetCloudbrainInferenceJobCountByUserID(userID int64) (int, error) { return int(count), err } +func GetGrampusCountByUserID(userID int64, jobType, computeResource string) (int, error) { + count, err := x.In("status", GrampusStatusPending, GrampusStatusRunning).And("job_type = ? and user_id = ? and type = ?", jobType, userID, TypeCloudBrainGrampus).And("compute_resource = ?", computeResource).Count(new(Cloudbrain)) + return int(count), err +} + func UpdateInferenceJob(job *Cloudbrain) error { return updateInferenceJob(x, job) } diff --git a/modules/auth/grampus.go b/modules/auth/grampus.go new file mode 100755 index 0000000000..2cfaf70061 --- /dev/null +++ b/modules/auth/grampus.go @@ -0,0 +1,23 @@ +package auth + +import ( + "gitea.com/macaron/binding" + "gitea.com/macaron/macaron" +) + +type CreateGrampusTrainJobForm struct { + DisplayJobName string `form:"display_job_name" binding:"Required"` + JobName string `form:"job_name" binding:"Required"` + Attachment string `form:"attachment" binding:"Required"` + BootFile string `form:"boot_file" binding:"Required"` + Flavor string `form:"flavor" binding:"Required"` + Params string `form:"run_para_list" binding:"Required"` + Description string `form:"description"` + BranchName string `form:"branch_name" binding:"Required"` + FlavorName string `form:"flaver_names" binding:"Required"` + EngineName string `form:"engine_names" binding:"Required"` +} + +func (f *CreateGrampusTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { + return validate(errs, ctx.Data, f, ctx.Locale) +} diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index a71389741d..dc1d0e4609 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -48,8 +48,6 @@ func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, er if !ctx.IsSigned { return false } - log.Info("is repo owner:" + strconv.FormatBool(ctx.IsUserRepoOwner())) - log.Info("is user admin:" + strconv.FormatBool(ctx.IsUserSiteAdmin())) if err != nil { return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin() diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index b60afb5cce..26e143429f 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -1,13 +1,11 @@ package grampus import ( - "code.gitea.io/gitea/modules/timeutil" - "strconv" - "code.gitea.io/gitea/models" "code.gitea.io/gitea/modules/context" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/notification" + "code.gitea.io/gitea/modules/timeutil" ) const ( @@ -21,19 +19,6 @@ const ( NotebookType = "Ascend" FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" - //train-job - // ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}" - // Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}" - // EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," + - // "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," + - // "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," + - // "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" + - // "]}" - // TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," + - // "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," + - // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," + - // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" + - // "]}" CodePath = "/code/" OutputPath = "/output/" ResultPath = "/result/" @@ -65,7 +50,12 @@ var ( ) type GenerateTrainJobReq struct { - JobName string + JobName string + Command string + ResourceSpecId string + ImageUrl string + ImageId string + DisplayJobName string Uuid string Description string @@ -74,15 +64,10 @@ type GenerateTrainJobReq struct { BootFileUrl string DataUrl string TrainUrl string - FlavorCode string - LogUrl string - PoolID string WorkServerNumber int EngineID int64 - Parameters []models.Parameter CommitID string IsLatestVersion string - Params string BranchName string PreVersionId int64 PreVersionName string @@ -90,139 +75,54 @@ type GenerateTrainJobReq struct { VersionCount int EngineName string TotalVersionCount int -} - -type GenerateInferenceJobReq struct { - JobName string - DisplayJobName string - Uuid string - Description string - CodeObsPath string - BootFile string - BootFileUrl string - DataUrl string - TrainUrl string - FlavorCode string - LogUrl string - PoolID string - WorkServerNumber int - EngineID int64 - Parameters []models.Parameter - CommitID string - Params string - BranchName string - FlavorName string - EngineName string - LabelName string - IsLatestVersion string - VersionCount int - TotalVersionCount int - ModelName string - ModelVersion string - CkptName string - ResultUrl string -} - -type VersionInfo struct { - Version []struct { - ID int `json:"id"` - Value string `json:"value"` - } `json:"version"` -} - -type Flavor struct { - Info []struct { - Code string `json:"code"` - Value string `json:"value"` - } `json:"flavor"` -} - -type Engine struct { - Info []struct { - ID int `json:"id"` - Value string `json:"value"` - } `json:"engine"` -} - -type ResourcePool struct { - Info []struct { - ID string `json:"id"` - Value string `json:"value"` - } `json:"resource_pool"` -} - -// type Parameter struct { -// Label string `json:"label"` -// Value string `json:"value"` -// } - -// type Parameters struct { -// Parameter []Parameter `json:"parameter"` -// } - -type Parameters struct { - Parameter []struct { - Label string `json:"label"` - Value string `json:"value"` - } `json:"parameter"` + ComputeResource string + DatasetName string } func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) { createTime := timeutil.TimeStampNow() - jobResult, err := createTrainJob(models.CreateTrainJobParams{ - JobName: req.JobName, - Description: req.Description, - Config: models.Config{ - WorkServerNum: req.WorkServerNumber, - AppUrl: req.CodeObsPath, - BootFileUrl: req.BootFileUrl, - DataUrl: req.DataUrl, - EngineID: req.EngineID, - TrainUrl: req.TrainUrl, - LogUrl: req.LogUrl, - PoolID: req.PoolID, - CreateVersion: true, - Flavor: models.Flavor{ - Code: req.FlavorCode, + jobResult, err := createJob(models.CreateGrampusJobRequest{ + Name: req.JobName, + Tasks: []models.GrampusTasks{ + { + Name: req.JobName, + Command: req.Command, + ResourceSpecId: req.ResourceSpecId, + ImageId: req.ImageId, + ImageUrl: req.ImageUrl, }, - Parameter: req.Parameters, }, }) if err != nil { - log.Error("CreateJob failed: %v", err.Error()) + log.Error("createJob failed: %v", err.Error()) return err } - attach, err := models.GetAttachmentByUUID(req.Uuid) - if err != nil { - log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error()) - return err - } - jobId := strconv.FormatInt(jobResult.JobID, 10) + jobID := jobResult.JobInfo.JobID err = models.CreateCloudbrain(&models.Cloudbrain{ - Status: TransTrainJobStatus(jobResult.Status), - UserID: ctx.User.ID, - RepoID: ctx.Repo.Repository.ID, - JobID: jobId, - JobName: req.JobName, - DisplayJobName: req.DisplayJobName, - JobType: string(models.JobTypeTrain), - Type: models.TypeCloudBrainTwo, - VersionID: jobResult.VersionID, - VersionName: jobResult.VersionName, - Uuid: req.Uuid, - DatasetName: attach.Name, - CommitID: req.CommitID, - IsLatestVersion: req.IsLatestVersion, - ComputeResource: models.NPUResource, - EngineID: req.EngineID, - TrainUrl: req.TrainUrl, - BranchName: req.BranchName, - Parameters: req.Params, - BootFile: req.BootFile, - DataUrl: req.DataUrl, - LogUrl: req.LogUrl, - FlavorCode: req.FlavorCode, + Status: string(models.GrampusStatusPending), + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: jobID, + JobName: req.JobName, + DisplayJobName: req.DisplayJobName, + JobType: string(models.JobTypeTrain), + Type: models.TypeCloudBrainGrampus, + //VersionID: jobResult.VersionID, + //VersionName: jobResult.VersionName, + Uuid: req.Uuid, + DatasetName: req.DatasetName, + CommitID: req.CommitID, + //IsLatestVersion: req.IsLatestVersion, + ComputeResource: req.ComputeResource, + //EngineID: req.EngineID, + TrainUrl: req.TrainUrl, + BranchName: req.BranchName, + //Parameters: req.Params, + BootFile: req.BootFile, + DataUrl: req.DataUrl, + //LogUrl: req.LogUrl, + //FlavorCode: req.FlavorCode, Description: req.Description, WorkServerNumber: req.WorkServerNumber, FlavorName: req.FlavorName, @@ -237,58 +137,14 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error()) return err } - notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask) - return nil -} - -func TransTrainJobStatus(status int) string { - switch status { - case 0: - return "UNKNOWN" - case 1: - return "INIT" - case 2: - return "IMAGE_CREATING" - case 3: - return "IMAGE_FAILED" - case 4: - return "SUBMIT_TRYING" - case 5: - return "SUBMIT_FAILED" - case 6: - return "DELETE_FAILED" - case 7: - return "WAITING" - case 8: - return "RUNNING" - case 9: - return "KILLING" - case 10: - return "COMPLETED" - case 11: - return "FAILED" - case 12: - return "KILLED" - case 13: - return "CANCELED" - case 14: - return "LOST" - case 15: - return "SCALING" - case 16: - return "SUBMIT_MODEL_FAILED" - case 17: - return "DEPLOY_SERVICE_FAILED" - case 18: - return "CHECK_INIT" - case 19: - return "CHECK_RUNNING" - case 20: - return "CHECK_RUNNING_COMPLETED" - case 21: - return "CHECK_FAILED" - default: - return strconv.Itoa(status) + var actionType models.ActionType + if req.ComputeResource == models.NPUResource { + actionType = models.ActionCreateTrainTask + } else if req.ComputeResource == models.GPUResource { + actionType = models.ActionCreateGPUTrainTask } + notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, actionType) + + return nil } diff --git a/modules/grampus/resty.go b/modules/grampus/resty.go index 693ba71a1d..f64c98b554 100755 --- a/modules/grampus/resty.go +++ b/modules/grampus/resty.go @@ -23,19 +23,18 @@ const ( urlOpenApiV1 = "/openapi/v1/" urlGetToken = urlOpenApiV1 + "token" - urlNotebook = "/demanager/instances" - urlTrainJob = "/training-jobs" + urlTrainJob = urlOpenApiV1 + "trainjob" urlResourceSpecs = "/job/resource-specs" urlTrainJobConfig = "/training-job-configs" errorCodeExceedLimit = "ModelArts.0118" urlNotebook2 = "" - modelartsIllegalToken = "" + errorIllegalToken = 1005 ) type GetTokenParams struct { - UserName string `json:"user_name"` + UserName string `json:"username"` Password string `json:"password"` } @@ -92,44 +91,34 @@ func getToken() error { return nil } -func CreateJob(createJobParams models.CreateNotebookParams) (*models.CreateNotebookResult, error) { +func createJob(req models.CreateGrampusJobRequest) (*models.CreateGrampusJobResponse, error) { checkSetting() client := getRestyClient() - var result models.CreateNotebookResult + var result models.CreateGrampusJobResponse retry := 0 sendjob: - res, err := client.R(). + _, err := client.R(). SetHeader("Content-Type", "application/json"). SetAuthToken(TOKEN). - SetBody(createJobParams). + SetBody(req). SetResult(&result). - Post(HOST + "/v1/" + setting.ProjectID + urlNotebook) + Post(HOST + urlTrainJob) if err != nil { - return nil, fmt.Errorf("resty create notebook: %s", err) + return nil, fmt.Errorf("resty CreateJob: %s", err) } - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + if result.ErrorCode == errorIllegalToken && retry < 1 { retry++ _ = getToken() goto sendjob } - var response models.NotebookResult - err = json.Unmarshal(res.Body(), &response) - if err != nil { - log.Error("json.Unmarshal failed: %s", err.Error()) - return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) - } - - if len(response.ErrorCode) != 0 { - log.Error("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) - if response.ErrorCode == errorCodeExceedLimit { - response.ErrorMsg = "所选规格使用数量已超过最大配额限制。" - } - return &result, fmt.Errorf("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) + if result.ErrorCode != 0 { + log.Error("CreateJob failed(%d): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("CreateJob failed(%d): %s", result.ErrorCode, result.ErrorMsg) } return &result, nil @@ -147,7 +136,7 @@ sendjob: SetHeader("Content-Type", "application/json"). SetAuthToken(TOKEN). SetResult(&result). - Get(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID) + Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID) if err != nil { return nil, fmt.Errorf("resty GetJob: %v", err) @@ -174,217 +163,6 @@ sendjob: return &result, nil } -func GetNotebook2(jobID string) (*models.GetNotebook2Result, error) { - checkSetting() - client := getRestyClient() - var result models.GetNotebook2Result - - retry := 0 - -sendjob: - res, err := client.R(). - SetHeader("Content-Type", "application/json"). - SetAuthToken(TOKEN). - SetResult(&result). - Get(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID) - - if err != nil { - return nil, fmt.Errorf("resty GetJob: %v", err) - } - - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - - var response models.NotebookResult - err = json.Unmarshal(res.Body(), &response) - if err != nil { - log.Error("json.Unmarshal failed: %s", err.Error()) - return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) - } - - if len(response.ErrorCode) != 0 { - log.Error("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) - if response.ErrorCode == modelartsIllegalToken && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - return &result, fmt.Errorf("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) - } - - return &result, nil -} - -func ManageNotebook(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) { - checkSetting() - client := getRestyClient() - var result models.NotebookActionResult - - retry := 0 - -sendjob: - res, err := client.R(). - SetHeader("Content-Type", "application/json"). - SetBody(param). - SetAuthToken(TOKEN). - SetResult(&result). - Post(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID + "/action") - - if err != nil { - return &result, fmt.Errorf("resty StopJob: %v", err) - } - - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - - var response models.NotebookResult - err = json.Unmarshal(res.Body(), &response) - if err != nil { - log.Error("json.Unmarshal failed: %s", err.Error()) - return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) - } - - if len(response.ErrorCode) != 0 { - log.Error("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) - return &result, fmt.Errorf("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) - } - - return &result, nil -} - -func ManageNotebook2(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) { - checkSetting() - client := getRestyClient() - var result models.NotebookActionResult - - retry := 0 - -sendjob: - res, err := client.R(). - SetHeader("Content-Type", "application/json"). - SetAuthToken(TOKEN). - SetResult(&result). - Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID + "/" + param.Action + "?duration=" + strconv.Itoa(autoStopDurationMs)) - - if err != nil { - return &result, fmt.Errorf("resty ManageNotebook2: %v", err) - } - - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - - var response models.NotebookResult - err = json.Unmarshal(res.Body(), &response) - if err != nil { - log.Error("json.Unmarshal failed: %s", err.Error()) - return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) - } - - if len(response.ErrorCode) != 0 { - log.Error("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) - if response.ErrorCode == modelartsIllegalToken && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - return &result, fmt.Errorf("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) - } - - return &result, nil -} - -func DelNotebook(jobID string) (*models.NotebookDelResult, error) { - checkSetting() - client := getRestyClient() - var result models.NotebookDelResult - - retry := 0 - -sendjob: - res, err := client.R(). - SetHeader("Content-Type", "application/json"). - SetAuthToken(TOKEN). - SetResult(&result). - Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID) - - if err != nil { - return &result, fmt.Errorf("resty DelJob: %v", err) - } - - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - - var response models.NotebookResult - err = json.Unmarshal(res.Body(), &response) - if err != nil { - log.Error("json.Unmarshal failed: %s", err.Error()) - return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) - } - - if len(response.ErrorCode) != 0 { - log.Error("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) - return &result, fmt.Errorf("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) - } - - return &result, nil -} - -func DelNotebook2(jobID string) (*models.NotebookDelResult, error) { - checkSetting() - client := getRestyClient() - var result models.NotebookDelResult - - retry := 0 - -sendjob: - res, err := client.R(). - SetHeader("Content-Type", "application/json"). - SetAuthToken(TOKEN). - SetResult(&result). - Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID) - - if err != nil { - return &result, fmt.Errorf("resty DelJob: %v", err) - } - - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - - var response models.NotebookResult - err = json.Unmarshal(res.Body(), &response) - if err != nil { - log.Error("json.Unmarshal failed: %s", err.Error()) - return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) - } - - if len(response.ErrorCode) != 0 { - log.Error("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) - if response.ErrorCode == modelartsIllegalToken && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - return &result, fmt.Errorf("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) - } - - return &result, nil -} - func DelJob(jobID string) (*models.NotebookDelResult, error) { checkSetting() client := getRestyClient() @@ -397,7 +175,7 @@ sendjob: SetHeader("Content-Type", "application/json"). SetAuthToken(TOKEN). SetResult(&result). - Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID) + Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID) if err != nil { return &result, fmt.Errorf("resty DelJob: %v", err) @@ -424,45 +202,6 @@ sendjob: return &result, nil } -func GetJobToken(jobID string) (*models.NotebookGetJobTokenResult, error) { - checkSetting() - client := getRestyClient() - var result models.NotebookGetJobTokenResult - - retry := 0 - -sendjob: - res, err := client.R(). - SetHeader("Content-Type", "application/json"). - SetAuthToken(TOKEN). - SetResult(&result). - Get(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID + "/token") - - if err != nil { - return &result, fmt.Errorf("resty GetJobToken: %v", err) - } - - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - - var response models.NotebookResult - err = json.Unmarshal(res.Body(), &response) - if err != nil { - log.Error("json.Unmarshal failed: %s", err.Error()) - return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) - } - - if len(response.ErrorCode) != 0 { - log.Error("GetJobToken failed(%s): %s", response.ErrorCode, response.ErrorMsg) - return &result, fmt.Errorf("GetJobToken failed(%s): %s", response.ErrorCode, response.ErrorMsg) - } - - return &result, nil -} - func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.CreateTrainJobResult, error) { checkSetting() client := getRestyClient() @@ -519,61 +258,6 @@ sendjob: return &result, nil } -func createTrainJobVersion(createJobVersionParams models.CreateTrainJobVersionParams, jobID string) (*models.CreateTrainJobResult, error) { - checkSetting() - client := getRestyClient() - var result models.CreateTrainJobResult - - retry := 0 - -sendjob: - res, err := client.R(). - SetHeader("Content-Type", "application/json"). - SetAuthToken(TOKEN). - SetBody(createJobVersionParams). - SetResult(&result). - Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions") - - if err != nil { - return nil, fmt.Errorf("resty create train-job version: %s", err) - } - - req, _ := json.Marshal(createJobVersionParams) - log.Info("%s", req) - - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - - if res.StatusCode() != http.StatusOK { - var temp models.ErrorResult - if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { - log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - } - BootFileErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.BootFileUrl + "'." - DataSetErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.DataUrl + "'." - if temp.ErrorMsg == BootFileErrorMsg { - log.Error("启动文件错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - return &result, fmt.Errorf("启动文件错误!") - } - if temp.ErrorMsg == DataSetErrorMsg { - log.Error("数据集错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - return &result, fmt.Errorf("数据集错误!") - } - return &result, fmt.Errorf("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - } - - if !result.IsSuccess { - log.Error("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg) - return &result, fmt.Errorf("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg) - } - - return &result, nil -} - func GetResourceSpecs() (*models.GetResourceSpecsResult, error) { checkSetting() client := getRestyClient() @@ -616,145 +300,6 @@ sendjob: return &result, nil } -func CreateTrainJobConfig(req models.CreateConfigParams) (*models.CreateTrainJobConfigResult, error) { - checkSetting() - client := getRestyClient() - var result models.CreateTrainJobConfigResult - - retry := 0 - -sendjob: - res, err := client.R(). - SetHeader("Content-Type", "application/json"). - SetAuthToken(TOKEN). - SetBody(req). - SetResult(&result). - Post(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig) - - if err != nil { - return nil, fmt.Errorf("resty CreateTrainJobConfig: %s", err) - } - - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - - //temp, _ := json.Marshal(req) - //log.Info("%s", temp) - - if res.StatusCode() != http.StatusOK { - var temp models.ErrorResult - if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { - log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - } - log.Error("CreateTrainJobConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - return &result, fmt.Errorf("CreateTrainJobConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - } - - if !result.IsSuccess { - log.Error("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg) - return &result, fmt.Errorf("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg) - } - - return &result, nil -} - -func GetConfigList(perPage, page int, sortBy, order, searchContent, configType string) (*models.GetConfigListResult, error) { - checkSetting() - client := getRestyClient() - var result models.GetConfigListResult - - retry := 0 - -sendjob: - res, err := client.R(). - SetQueryParams(map[string]string{ - "per_page": strconv.Itoa(perPage), - "page": strconv.Itoa(page), - "sortBy": sortBy, - "order": order, - "search_content": searchContent, - "config_type": configType, - }). - SetAuthToken(TOKEN). - SetResult(&result). - Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig) - - if err != nil { - return nil, fmt.Errorf("resty GetConfigList: %v", err) - } - - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - - if res.StatusCode() != http.StatusOK { - var temp models.ErrorResult - if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { - log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - } - log.Error("GetConfigList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - return &result, fmt.Errorf("获取参数配置列表失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - } - - if !result.IsSuccess { - log.Error("GetConfigList failed(%s): %s", result.ErrorCode, result.ErrorMsg) - return &result, fmt.Errorf("获取参数配置列表失败(%s): %s", result.ErrorCode, result.ErrorMsg) - } - - return &result, nil -} - -func GetParaConfig(configName, configType string) (models.GetConfigResult, error) { - checkSetting() - client := getRestyClient() - var result models.GetConfigResult - - retry := 0 - -sendjob: - res, err := client.R(). - SetQueryParams(map[string]string{ - "config_type": configType, - }). - SetAuthToken(TOKEN). - SetResult(&result). - Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig + "/" + configName) - - if err != nil { - return result, fmt.Errorf("resty GetParaConfig: %v", err) - } - - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - - if res.StatusCode() != http.StatusOK { - var temp models.ErrorResult - if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { - log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - return result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - } - log.Error("GetParaConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - return result, fmt.Errorf("获取参数配置详情失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - } - - if !result.IsSuccess { - log.Error("GetParaConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg) - return result, fmt.Errorf("获取参数配置详情失败(%s): %s", result.ErrorCode, result.ErrorMsg) - } - - return result, nil -} - func GetTrainJob(jobID, versionID string) (*models.GetTrainJobResult, error) { checkSetting() client := getRestyClient() @@ -1062,51 +607,3 @@ sendjob: return &result, nil } - -func createNotebook2(createJobParams models.CreateNotebook2Params) (*models.CreateNotebookResult, error) { - checkSetting() - client := getRestyClient() - var result models.CreateNotebookResult - - retry := 0 - -sendjob: - res, err := client.R(). - SetHeader("Content-Type", "application/json"). - SetAuthToken(TOKEN). - SetBody(createJobParams). - SetResult(&result). - Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2) - - if err != nil { - return nil, fmt.Errorf("resty create notebook2: %s", err) - } - - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - - var response models.NotebookResult - err = json.Unmarshal(res.Body(), &response) - if err != nil { - log.Error("json.Unmarshal failed: %s", err.Error()) - return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) - } - - if len(response.ErrorCode) != 0 { - log.Error("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) - if response.ErrorCode == errorCodeExceedLimit { - response.ErrorMsg = "所选规格使用数量已超过最大配额限制。" - } - if response.ErrorCode == modelartsIllegalToken && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - return &result, fmt.Errorf("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) - } - - return &result, nil -} diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index 8f31896616..c8c54a9287 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -3,9 +3,11 @@ package repo import ( "code.gitea.io/gitea/modules/auth" "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/grampus" "code.gitea.io/gitea/modules/modelarts" "code.gitea.io/gitea/modules/util" "encoding/json" + "errors" "io/ioutil" "net/http" "os" @@ -149,7 +151,7 @@ func grampusGpuNewDataPrepare(ctx *context.Context) error { } func GrampusTrainJobNPUNew(ctx *context.Context) { - err := trainJobNpuNewDataPrepare(ctx) + err := grampusTrainJobNpuNewDataPrepare(ctx) if err != nil { ctx.ServerError("get new train-job info failed", err) return @@ -157,7 +159,7 @@ func GrampusTrainJobNPUNew(ctx *context.Context) { ctx.HTML(200, tplGrampusTrainJobNPUNew) } -func trainJobNpuNewDataPrepare(ctx *context.Context) error { +func grampusTrainJobNpuNewDataPrepare(ctx *context.Context) error { ctx.Data["PageIsCloudBrain"] = true t := time.Now() @@ -215,110 +217,122 @@ func trainJobNpuNewDataPrepare(ctx *context.Context) error { return nil } -func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) { +func grampusParamCheckCreateTrainJob(form auth.CreateGrampusTrainJobForm) error { + if !strings.HasSuffix(form.BootFile, ".py") { + log.Error("the boot file(%s) must be a python file", form.BootFile) + return errors.New("启动文件必须是python文件") + } + + if form.BranchName == "" { + log.Error("the branch must not be null!", form.BranchName) + return errors.New("代码分支不能为空!") + } + + return nil +} + +func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) { VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount) displayJobName := form.DisplayJobName jobName := util.ConvertDisplayJobNameToJobName(displayJobName) + //todo:del + jobName = displayJobName uuid := form.Attachment description := form.Description - workServerNumber := form.WorkServerNumber - engineID := form.EngineID bootFile := form.BootFile - flavorCode := form.Flavor params := form.Params - poolID := form.PoolID - isSaveParam := form.IsSaveParam repo := ctx.Repo.Repository codeLocalPath := setting.JobPath + jobName + modelarts.CodePath codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath - outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/" - logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/" dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/" - branch_name := form.BranchName + branchName := form.BranchName isLatestVersion := modelarts.IsLatestVersion FlavorName := form.FlavorName VersionCount := modelarts.VersionCount EngineName := form.EngineName - count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) + log.Info(jobName) + + count, err := models.GetGrampusCountByUserID(ctx.User.ID, string(models.JobTypeTrain), models.NPUResource) if err != nil { - log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) - trainJobErrorNewDataPrepare(ctx, form) - ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form) + log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"]) + grampusTrainJobNpuNewDataPrepare(ctx) + ctx.RenderWithErr("system error", tplGrampusTrainJobNPUNew, &form) return } else { if count >= 1 { log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) - trainJobErrorNewDataPrepare(ctx, form) - ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobNew, &form) + grampusTrainJobNpuNewDataPrepare(ctx) + ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplGrampusTrainJobNPUNew, &form) return } } - if err := paramCheckCreateTrainJob(form); err != nil { + if err := grampusParamCheckCreateTrainJob(form); err != nil { log.Error("paramCheckCreateTrainJob failed:(%v)", err) - trainJobErrorNewDataPrepare(ctx, form) - ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) + grampusTrainJobNpuNewDataPrepare(ctx) + ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form) return } - //Determine whether the task name of the task in the project is duplicated + //check whether the task name in the project is duplicated tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName) if err == nil { if len(tasks) != 0 { log.Error("the job name did already exist", ctx.Data["MsgID"]) - trainJobErrorNewDataPrepare(ctx, form) - ctx.RenderWithErr("the job name did already exist", tplModelArtsTrainJobNew, &form) + grampusTrainJobNpuNewDataPrepare(ctx) + ctx.RenderWithErr("the job name did already exist", tplGrampusTrainJobNPUNew, &form) return } } else { if !models.IsErrJobNotExist(err) { log.Error("system error, %v", err, ctx.Data["MsgID"]) - trainJobErrorNewDataPrepare(ctx, form) - ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form) + grampusTrainJobNpuNewDataPrepare(ctx) + ctx.RenderWithErr("system error", tplGrampusTrainJobNPUNew, &form) return } } - //todo: del the codeLocalPath + //prepare code and out path _, err = ioutil.ReadDir(codeLocalPath) if err == nil { os.RemoveAll(codeLocalPath) } gitRepo, _ := git.OpenRepository(repo.RepoPath()) - commitID, _ := gitRepo.GetBranchCommitID(branch_name) + commitID, _ := gitRepo.GetBranchCommitID(branchName) - if err := downloadCode(repo, codeLocalPath, branch_name); err != nil { + if err := downloadCode(repo, codeLocalPath, branchName); err != nil { log.Error("downloadCode failed, server timed out: %s (%v)", repo.FullName(), err) - trainJobErrorNewDataPrepare(ctx, form) - ctx.RenderWithErr("Create task failed, server timed out", tplModelArtsTrainJobNew, &form) + grampusTrainJobNpuNewDataPrepare(ctx) + ctx.RenderWithErr("Create task failed, server timed out", tplGrampusTrainJobNPUNew, &form) return } //todo: upload code (send to file_server todo this work?) if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil { log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err) - trainJobErrorNewDataPrepare(ctx, form) - ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form) + grampusTrainJobNpuNewDataPrepare(ctx) + ctx.RenderWithErr("Failed to obsMkdir_output", tplGrampusTrainJobNPUNew, &form) return } if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil { log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err) - trainJobErrorNewDataPrepare(ctx, form) - ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form) + grampusTrainJobNpuNewDataPrepare(ctx) + ctx.RenderWithErr("Failed to obsMkdir_log", tplGrampusTrainJobNPUNew, &form) return } - // parentDir := VersionOutputPath + "/" if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { // if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil { log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) - trainJobErrorNewDataPrepare(ctx, form) - ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form) + grampusTrainJobNpuNewDataPrepare(ctx) + ctx.RenderWithErr("Failed to uploadCodeToObs", tplGrampusTrainJobNPUNew, &form) return } + //prepare command + //todo: download code, download dataset, unzip dataset, exec code, upload model var parameters models.Parameters param := make([]models.Parameter, 0) existDeviceTarget := false @@ -326,8 +340,8 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateModelArtsTra err := json.Unmarshal([]byte(params), ¶meters) if err != nil { log.Error("Failed to Unmarshal params: %s (%v)", params, err) - trainJobErrorNewDataPrepare(ctx, form) - ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobNew, &form) + grampusTrainJobNpuNewDataPrepare(ctx) + ctx.RenderWithErr("运行参数错误", tplGrampusTrainJobNPUNew, &form) return } @@ -350,67 +364,32 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateModelArtsTra }) } - //save param config - if isSaveParam == "on" { - saveparams := append(param, models.Parameter{ - Label: modelarts.TrainUrl, - Value: outputObsPath, - }, models.Parameter{ - Label: modelarts.DataUrl, - Value: dataPath, - }) - if form.ParameterTemplateName == "" { - log.Error("ParameterTemplateName is empty") - trainJobNewDataPrepare(ctx) - ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form) - return - } - - _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{ - ConfigName: form.ParameterTemplateName, - Description: form.PrameterDescription, - DataUrl: dataPath, - AppUrl: codeObsPath, - BootFileUrl: codeObsPath + bootFile, - TrainUrl: outputObsPath, - Flavor: models.Flavor{ - Code: flavorCode, - }, - WorkServerNum: workServerNumber, - EngineID: int64(engineID), - LogUrl: logObsPath, - PoolID: poolID, - Parameter: saveparams, - }) - - if err != nil { - log.Error("Failed to CreateTrainJobConfig: %v", err) - trainJobErrorNewDataPrepare(ctx, form) - ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form) - return - } - } - - req := &modelarts.GenerateTrainJobReq{ - JobName: jobName, - DisplayJobName: displayJobName, - DataUrl: dataPath, - Description: description, - CodeObsPath: codeObsPath, - BootFileUrl: codeObsPath + bootFile, - BootFile: bootFile, - TrainUrl: outputObsPath, - FlavorCode: flavorCode, - WorkServerNumber: workServerNumber, - EngineID: int64(engineID), - LogUrl: logObsPath, - PoolID: poolID, - Uuid: uuid, - Parameters: param, - CommitID: commitID, - IsLatestVersion: isLatestVersion, - BranchName: branch_name, - Params: form.Params, + req := &grampus.GenerateTrainJobReq{ + JobName: jobName, + DisplayJobName: displayJobName, + ComputeResource: models.NPUResource, + Command: "echo \"test\"", + ResourceSpecId: "modelarts.kat1.xlarge", + ImageUrl: "", + ImageId: "tensorflow_1.15-cann_5.0.3-py_3.7-euler_2.8.3-aarch64", + + DataUrl: dataPath, + Description: description, + CodeObsPath: codeObsPath, + BootFileUrl: codeObsPath + bootFile, + BootFile: bootFile, + //TrainUrl: outputObsPath, + //FlavorCode: flavorCode, + WorkServerNumber: 1, + //EngineID: int64(engineID), + //LogUrl: logObsPath, + //PoolID: poolID, + Uuid: uuid, + //Parameters: param, + CommitID: commitID, + IsLatestVersion: isLatestVersion, + BranchName: branchName, + //Params: form.Params, FlavorName: FlavorName, EngineName: EngineName, VersionCount: VersionCount, @@ -424,11 +403,11 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateModelArtsTra return } - err = modelarts.GenerateTrainJob(ctx, req) + err = grampus.GenerateTrainJob(ctx, req) if err != nil { log.Error("GenerateTrainJob failed:%v", err.Error()) - trainJobErrorNewDataPrepare(ctx, form) - ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) + grampusTrainJobNpuNewDataPrepare(ctx) + ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form) return } ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 3b4e8738fc..dfb2631a01 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -559,24 +559,11 @@ func TrainJobIndex(ctx *context.Context) { } listType := ctx.Query("listType") - if len(listType) == 0 { - listType = models.AllResource - } ctx.Data["ListType"] = listType - typeCloudBrain := models.TypeCloudBrainAll - if listType == models.GPUResource { - typeCloudBrain = models.TypeCloudBrainOne - } else if listType == models.NPUResource { - typeCloudBrain = models.TypeCloudBrainTwo - } else if listType == models.AllResource { - typeCloudBrain = models.TypeCloudBrainAll + if listType == models.AllResource { + listType = "" } - //else { - // log.Error("listType(%s) error", listType) - // ctx.ServerError("listType error", errors.New("listType error")) - // return - //} var jobTypes []string jobTypes = append(jobTypes, string(models.JobTypeTrain)) @@ -586,10 +573,10 @@ func TrainJobIndex(ctx *context.Context) { PageSize: setting.UI.IssuePagingNum, }, RepoID: repo.ID, - Type: typeCloudBrain, JobTypeNot: false, JobTypes: jobTypes, IsLatestVersion: modelarts.IsLatestVersion, + ComputeResource: listType, }) if err != nil { ctx.ServerError("Cloudbrain", err) diff --git a/routers/routes/routes.go b/routers/routes/routes.go index ab3e2dd559..6348e4ba8b 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -1103,7 +1103,7 @@ func RegisterRoutes(m *macaron.Macaron) { m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel) }) m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusTrainJobNPUNew) - //m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.GrampusTrainJobCreate) + m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateGrampusTrainJobForm{}), repo.GrampusTrainJobNpuCreate) }) }) }, context.RepoRef()) -- 2.34.1 From ad72d9510337e40c9f10fbe4887c234d998da2f8 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Mon, 23 May 2022 20:21:44 +0800 Subject: [PATCH 04/56] get job --- models/cloudbrain.go | 19 ++++++++++++------- modules/grampus/grampus.go | 19 ++++++++++++++----- modules/grampus/resty.go | 24 +++++++++--------------- routers/api/v1/repo/modelarts.go | 26 +++++++++++++++++++++++++- routers/repo/cloudbrain.go | 25 +++++++++++++++++++++++++ routers/repo/modelarts.go | 1 + 6 files changed, 86 insertions(+), 28 deletions(-) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index f775626ada..694e277d41 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -102,11 +102,11 @@ const ( //grampus GrampusStatusPending = "pending" - GrampusStatusRunning = "running" - GrampusStatusFailed = "failed" - GrampusStatusSucceeded = "succeeded" - GrampusStatusStopped = "stopped" - GrampusStatusUnknown = "unknown" + GrampusStatusRunning = "RUNNING" + GrampusStatusFailed = "FAILED" + GrampusStatusSucceeded = "SUCCEEDED" + GrampusStatusStopped = "STOPPED" + GrampusStatusUnknown = "UNKNOWN" ) type Cloudbrain struct { @@ -214,7 +214,7 @@ func ConvertDurationToStr(duration int64) string { } func IsTrainJobTerminal(status string) bool { - return status == string(ModelArtsTrainJobCompleted) || status == string(ModelArtsTrainJobFailed) || status == string(ModelArtsTrainJobKilled) + return status == string(ModelArtsTrainJobCompleted) || status == string(ModelArtsTrainJobFailed) || status == string(ModelArtsTrainJobKilled) || status == GrampusStatusFailed || status == GrampusStatusStopped || status == GrampusStatusSucceeded } func IsModelArtsDebugJobTerminal(status string) bool { @@ -1185,6 +1185,11 @@ type CreateGrampusJobResponse struct { JobInfo GrampusJobInfo `json:"otJob"` } +type GetGrampusJobResponse struct { + GrampusResult + JobInfo GrampusJobInfo `json:"otJob"` +} + type GrampusTasks struct { Command string `json:"command"` Name string `json:"name"` @@ -1227,7 +1232,7 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { ) } - if len(opts.ComputeResource) >= 0 { + if len(opts.ComputeResource) > 0 { cond = cond.And( builder.Eq{"cloudbrain.compute_resource": opts.ComputeResource}, ) diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index 26e143429f..71e368fa6f 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -6,6 +6,7 @@ import ( "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/notification" "code.gitea.io/gitea/modules/timeutil" + "strings" ) const ( @@ -100,7 +101,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error jobID := jobResult.JobInfo.JobID err = models.CreateCloudbrain(&models.Cloudbrain{ - Status: string(models.GrampusStatusPending), + Status: TransTrainJobStatus(jobResult.JobInfo.Status), UserID: ctx.User.ID, RepoID: ctx.Repo.Repository.ID, JobID: jobID, @@ -110,10 +111,10 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error Type: models.TypeCloudBrainGrampus, //VersionID: jobResult.VersionID, //VersionName: jobResult.VersionName, - Uuid: req.Uuid, - DatasetName: req.DatasetName, - CommitID: req.CommitID, - //IsLatestVersion: req.IsLatestVersion, + Uuid: req.Uuid, + DatasetName: req.DatasetName, + CommitID: req.CommitID, + IsLatestVersion: req.IsLatestVersion, ComputeResource: req.ComputeResource, //EngineID: req.EngineID, TrainUrl: req.TrainUrl, @@ -148,3 +149,11 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error return nil } + +func TransTrainJobStatus(status string) string { + if status == "pending" { + status = "waiting" + } + + return strings.ToUpper(status) +} diff --git a/modules/grampus/resty.go b/modules/grampus/resty.go index f64c98b554..6f1ee72d67 100755 --- a/modules/grampus/resty.go +++ b/modules/grampus/resty.go @@ -124,40 +124,34 @@ sendjob: return &result, nil } -func GetJob(jobID string) (*models.GetNotebookResult, error) { +func GetJob(jobID string) (*models.GetGrampusJobResponse, error) { checkSetting() client := getRestyClient() - var result models.GetNotebookResult + var result models.GetGrampusJobResponse retry := 0 sendjob: - res, err := client.R(). + _, err := client.R(). SetHeader("Content-Type", "application/json"). SetAuthToken(TOKEN). SetResult(&result). - Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID) + Get(HOST + urlTrainJob + "/" + jobID) if err != nil { return nil, fmt.Errorf("resty GetJob: %v", err) } - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + if result.ErrorCode == errorIllegalToken && retry < 1 { retry++ + log.Info("retry get token") _ = getToken() goto sendjob } - var response models.NotebookResult - err = json.Unmarshal(res.Body(), &response) - if err != nil { - log.Error("json.Unmarshal failed: %s", err.Error()) - return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) - } - - if len(response.ErrorCode) != 0 { - log.Error("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) - return &result, fmt.Errorf("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + if result.ErrorCode != 0 { + log.Error("GetJob failed(%d): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("GetJob failed(%d): %s", result.ErrorCode, result.ErrorMsg) } return &result, nil diff --git a/routers/api/v1/repo/modelarts.go b/routers/api/v1/repo/modelarts.go index 9e4edea032..06e4bea44d 100755 --- a/routers/api/v1/repo/modelarts.go +++ b/routers/api/v1/repo/modelarts.go @@ -6,6 +6,7 @@ package repo import ( + "code.gitea.io/gitea/modules/grampus" "net/http" "strconv" "strings" @@ -167,7 +168,7 @@ func GetModelArtsTrainJobVersion(ctx *context.APIContext) { log.Error("UpdateJob failed:", err) } } - } else { + } else if job.Type == models.TypeCloudBrainTwo { result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(job.VersionID, 10)) if err != nil { ctx.NotFound(err) @@ -181,6 +182,29 @@ func GetModelArtsTrainJobVersion(ctx *context.APIContext) { job.Duration = result.Duration / 1000 job.TrainJobDuration = models.ConvertDurationToStr(job.Duration) + if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 { + job.EndTime = job.StartTime.Add(job.Duration) + } + job.CorrectCreateUnix() + err = models.UpdateTrainJobVersion(job) + if err != nil { + log.Error("UpdateJob failed:", err) + } + } else if job.Type == models.TypeCloudBrainGrampus { + result, err := grampus.GetJob(jobID) + if err != nil { + log.Error("GetJob(%s) failed:%v", job.JobName, err) + ctx.NotFound(err) + return + } + + if job.StartTime == 0 && result.JobInfo.StartedAt > 0 { + job.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt / 1000) + } + job.Status = grampus.TransTrainJobStatus(result.JobInfo.Status) + job.Duration = result.JobInfo.RunSec + job.TrainJobDuration = models.ConvertDurationToStr(job.Duration) + if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 { job.EndTime = job.StartTime.Add(job.Duration) } diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index df27a12c26..b95fe2fe55 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -2,6 +2,7 @@ package repo import ( "bufio" + "code.gitea.io/gitea/modules/grampus" "encoding/json" "errors" "fmt" @@ -1492,7 +1493,31 @@ func SyncCloudbrainStatus() { } else { log.Error("task.JobType(%s) is error:%s", task.JobName, task.JobType) } + } else if task.Type == models.TypeCloudBrainGrampus { + result, err := grampus.GetJob(task.JobID) + if err != nil { + log.Error("GetTrainJob(%s) failed:%v", task.JobName, err) + continue + } + if result != nil { + task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status) + task.Duration = result.JobInfo.RunSec + task.TrainJobDuration = models.ConvertDurationToStr(task.Duration) + + if task.StartTime == 0 && result.JobInfo.StartedAt > 0 { + task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt / 1000) + } + if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 { + task.EndTime = task.StartTime.Add(task.Duration) + } + task.CorrectCreateUnix() + err = models.UpdateJob(task) + if err != nil { + log.Error("UpdateJob(%s) failed:%v", task.JobName, err) + continue + } + } } else { log.Error("task.Type(%s) is error:%d", task.JobName, task.Type) } diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index dfb2631a01..ea4ff3b1d2 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -577,6 +577,7 @@ func TrainJobIndex(ctx *context.Context) { JobTypes: jobTypes, IsLatestVersion: modelarts.IsLatestVersion, ComputeResource: listType, + Type: models.TypeCloudBrainAll, }) if err != nil { ctx.ServerError("Cloudbrain", err) -- 2.34.1 From 340f25c73df4aab6906a134272f7352be7843b66 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Tue, 24 May 2022 18:10:36 +0800 Subject: [PATCH 05/56] show job --- models/cloudbrain.go | 10 + modules/cloudbrain/cloudbrain.go | 1 - modules/grampus/resty.go | 393 +--------- routers/api/v1/api.go | 9 + routers/api/v1/repo/modelarts.go | 2 +- routers/repo/cloudbrain.go | 2 +- routers/repo/grampus.go | 183 ++++- routers/repo/modelarts.go | 5 - routers/routes/routes.go | 18 +- templates/repo/grampus/trainjob/gpu/show.tmpl | 731 ------------------ .../repo/grampus/trainjob/{npu => }/show.tmpl | 69 +- templates/repo/modelarts/trainjob/index.tmpl | 6 +- 12 files changed, 245 insertions(+), 1184 deletions(-) delete mode 100755 templates/repo/grampus/trainjob/gpu/show.tmpl rename templates/repo/grampus/trainjob/{npu => }/show.tmpl (90%) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 694e277d41..6a6645d6b4 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -1190,6 +1190,11 @@ type GetGrampusJobResponse struct { JobInfo GrampusJobInfo `json:"otJob"` } +type GrampusStopJobResponse struct { + GrampusResult + StoppedAt int64 `json:"stoppedAt"` +} + type GrampusTasks struct { Command string `json:"command"` Name string `json:"name"` @@ -1487,6 +1492,11 @@ func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) { return getRepoCloudBrain(cb) } +func GetCloudbrainByJobIDWithDeleted(jobID string) (*Cloudbrain, error) { + cb := &Cloudbrain{JobID: jobID} + return getRepoCloudBrainWithDeleted(cb) +} + func GetCloudbrainByID(id string) (*Cloudbrain, error) { idInt64, _ := strconv.ParseInt(id, 10, 64) cb := &Cloudbrain{ID: idInt64} diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index dc1d0e4609..0e62b71d5c 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -52,7 +52,6 @@ func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, er return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin() } else { - log.Info("is job creator:" + strconv.FormatBool(ctx.User.ID == job.UserID)) return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID } diff --git a/modules/grampus/resty.go b/modules/grampus/resty.go index 6f1ee72d67..0d87f390d0 100755 --- a/modules/grampus/resty.go +++ b/modules/grampus/resty.go @@ -1,16 +1,14 @@ package grampus import ( - "crypto/tls" - "encoding/json" - "fmt" - "net/http" - "strconv" - "code.gitea.io/gitea/models" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" + "crypto/tls" + "encoding/json" + "fmt" "github.com/go-resty/resty/v2" + "net/http" ) var ( @@ -129,11 +127,11 @@ func GetJob(jobID string) (*models.GetGrampusJobResponse, error) { client := getRestyClient() var result models.GetGrampusJobResponse + log.Info(jobID, TOKEN) retry := 0 sendjob: _, err := client.R(). - SetHeader("Content-Type", "application/json"). SetAuthToken(TOKEN). SetResult(&result). Get(HOST + urlTrainJob + "/" + jobID) @@ -157,101 +155,6 @@ sendjob: return &result, nil } -func DelJob(jobID string) (*models.NotebookDelResult, error) { - checkSetting() - client := getRestyClient() - var result models.NotebookDelResult - - retry := 0 - -sendjob: - res, err := client.R(). - SetHeader("Content-Type", "application/json"). - SetAuthToken(TOKEN). - SetResult(&result). - Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID) - - if err != nil { - return &result, fmt.Errorf("resty DelJob: %v", err) - } - - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - - var response models.NotebookResult - err = json.Unmarshal(res.Body(), &response) - if err != nil { - log.Error("json.Unmarshal failed: %s", err.Error()) - return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) - } - - if len(response.ErrorCode) != 0 { - log.Error("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) - return &result, fmt.Errorf("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) - } - - return &result, nil -} - -func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.CreateTrainJobResult, error) { - checkSetting() - client := getRestyClient() - var result models.CreateTrainJobResult - - retry := 0 - -sendjob: - res, err := client.R(). - SetHeader("Content-Type", "application/json"). - SetAuthToken(TOKEN). - SetBody(createJobParams). - SetResult(&result). - Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob) - - if err != nil { - return nil, fmt.Errorf("resty create train-job: %s", err) - } - - req, _ := json.Marshal(createJobParams) - log.Info("%s", req) - - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - - if res.StatusCode() != http.StatusOK { - var temp models.ErrorResult - if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { - log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - } - log.Error("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - BootFileErrorMsg := "Invalid OBS path '" + createJobParams.Config.BootFileUrl + "'." - DataSetErrorMsg := "Invalid OBS path '" + createJobParams.Config.DataUrl + "'." - if temp.ErrorMsg == BootFileErrorMsg { - log.Error("启动文件错误!createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - return &result, fmt.Errorf("启动文件错误!") - } - if temp.ErrorMsg == DataSetErrorMsg { - log.Error("数据集错误!createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - return &result, fmt.Errorf("数据集错误!") - } - return &result, fmt.Errorf("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - } - - if !result.IsSuccess { - log.Error("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg) - return &result, fmt.Errorf("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg) - } - - return &result, nil -} - func GetResourceSpecs() (*models.GetResourceSpecsResult, error) { checkSetting() client := getRestyClient() @@ -294,309 +197,63 @@ sendjob: return &result, nil } -func GetTrainJob(jobID, versionID string) (*models.GetTrainJobResult, error) { +func GetTrainJobLog(jobID string) (string, error) { checkSetting() client := getRestyClient() - var result models.GetTrainJobResult - - retry := 0 + var logContent string -sendjob: res, err := client.R(). SetAuthToken(TOKEN). - SetResult(&result). - Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID) + SetResult(&logContent). + Get(HOST + urlTrainJob + "/" + jobID + "/log") if err != nil { - return nil, fmt.Errorf("resty GetTrainJob: %v", err) - } - - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob + return logContent, fmt.Errorf("resty GetTrainJobLog: %v", err) } if res.StatusCode() != http.StatusOK { - var temp models.ErrorResult + var temp models.GrampusResult if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - } - log.Error("GetTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - return &result, fmt.Errorf("获取作业详情失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - } - - if !result.IsSuccess { - log.Error("GetTrainJob(%s) failed", jobID) - return &result, fmt.Errorf("获取作业详情失败") - } - - return &result, nil -} - -func GetTrainJobLog(jobID, versionID, baseLine, logFile, order string, lines int) (*models.GetTrainJobLogResult, error) { - checkSetting() - client := getRestyClient() - var result models.GetTrainJobLogResult - - retry := 0 - -sendjob: - res, err := client.R(). - SetQueryParams(map[string]string{ - "base_line": baseLine, - "lines": strconv.Itoa(lines), - "log_file": logFile, - "order": order, - }). - SetAuthToken(TOKEN). - SetResult(&result). - Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/aom-log") - - if err != nil { - return nil, fmt.Errorf("resty GetTrainJobLog: %v", err) - } - - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - - if res.StatusCode() != http.StatusOK { - var temp models.ErrorResult - if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { - log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return logContent, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) } log.Error("GetTrainJobLog failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - return &result, fmt.Errorf("获取作业日志失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - } - - if !result.IsSuccess { - log.Error("GetTrainJobLog(%s) failed", jobID) - return &result, fmt.Errorf("获取作业日志失败:%s", result.ErrorMsg) - } - - return &result, nil -} - -func GetTrainJobLogFileNames(jobID, versionID string) (*models.GetTrainJobLogFileNamesResult, error) { - checkSetting() - client := getRestyClient() - var result models.GetTrainJobLogFileNamesResult - - retry := 0 - -sendjob: - res, err := client.R(). - SetAuthToken(TOKEN). - SetResult(&result). - Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/log/file-names") - - if err != nil { - return nil, fmt.Errorf("resty GetTrainJobLogFileNames: %v", err) - } - - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - - if res.StatusCode() != http.StatusOK { - var temp models.ErrorResult - if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { - log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - } - log.Error("GetTrainJobLogFileNames failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - return &result, fmt.Errorf("GetTrainJobLogFileNames failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return logContent, fmt.Errorf("GetTrainJobLog failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) } - if !result.IsSuccess { - log.Error("GetTrainJobLogFileNames(%s) failed", jobID) - return &result, fmt.Errorf("获取作业日志文件失败:%s", result.ErrorMsg) - } + logContent = res.String() - return &result, nil + return logContent, nil } -func DelTrainJob(jobID string) (*models.TrainJobResult, error) { +func StopJob(jobID string) (*models.GrampusStopJobResponse, error) { checkSetting() client := getRestyClient() - var result models.TrainJobResult + var result models.GrampusStopJobResponse retry := 0 sendjob: - res, err := client.R(). - SetAuthToken(TOKEN). - SetResult(&result). - Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID) - - if err != nil { - return &result, fmt.Errorf("resty DelTrainJob: %v", err) - } - - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - - if res.StatusCode() != http.StatusOK { - var temp models.ErrorResult - if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { - log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - } - log.Error("DelTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - return &result, fmt.Errorf("删除训练作业失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - } - - if !result.IsSuccess { - log.Error("DelTrainJob(%s) failed", jobID) - return &result, fmt.Errorf("删除训练作业失败:%s", result.ErrorMsg) - } - - return &result, nil -} - -func StopTrainJob(jobID, versionID string) (*models.TrainJobResult, error) { - checkSetting() - client := getRestyClient() - var result models.TrainJobResult - - retry := 0 - -sendjob: - res, err := client.R(). + _, err := client.R(). + //SetHeader("Content-Type", "application/json"). SetAuthToken(TOKEN). SetResult(&result). - Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/stop") + Post(HOST + urlTrainJob + "/" + jobID + "/stop") if err != nil { return &result, fmt.Errorf("resty StopTrainJob: %v", err) } - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - - if res.StatusCode() != http.StatusOK { - var temp models.ErrorResult - if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { - log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - } - log.Error("StopTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - return &result, fmt.Errorf("停止训练作业失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - } - - if !result.IsSuccess { - log.Error("StopTrainJob(%s) failed", jobID) - return &result, fmt.Errorf("停止训练作业失败:%s", result.ErrorMsg) - } - - return &result, nil -} - -func DelTrainJobVersion(jobID string, versionID string) (*models.TrainJobResult, error) { - checkSetting() - client := getRestyClient() - var result models.TrainJobResult - - retry := 0 - -sendjob: - res, err := client.R(). - SetAuthToken(TOKEN). - SetResult(&result). - Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID) - - if err != nil { - return &result, fmt.Errorf("resty DelTrainJobVersion: %v", err) - } - - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { - retry++ - _ = getToken() - goto sendjob - } - - if res.StatusCode() != http.StatusOK { - var temp models.ErrorResult - if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { - log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - } - log.Error("DelTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - return &result, fmt.Errorf("删除训练作业版本失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - } - - if !result.IsSuccess { - log.Error("DelTrainJob(%s) failed", jobID) - return &result, fmt.Errorf("删除训练作业版本失败:%s", result.ErrorMsg) - } - - return &result, nil -} - -func createInferenceJob(createJobParams models.CreateInferenceJobParams) (*models.CreateTrainJobResult, error) { - checkSetting() - client := getRestyClient() - var result models.CreateTrainJobResult - - retry := 0 - -sendjob: - res, err := client.R(). - SetHeader("Content-Type", "application/json"). - SetAuthToken(TOKEN). - SetBody(createJobParams). - SetResult(&result). - Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob) - - if err != nil { - return nil, fmt.Errorf("resty create inference-job: %s", err) - } - - req, _ := json.Marshal(createJobParams) - log.Info("%s", req) - - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + if result.ErrorCode == errorIllegalToken && retry < 1 { retry++ + log.Info("retry get token") _ = getToken() goto sendjob } - if res.StatusCode() != http.StatusOK { - var temp models.ErrorResult - if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { - log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - } - log.Error("createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - BootFileErrorMsg := "Invalid OBS path '" + createJobParams.InfConfig.BootFileUrl + "'." - DataSetErrorMsg := "Invalid OBS path '" + createJobParams.InfConfig.DataUrl + "'." - if temp.ErrorMsg == BootFileErrorMsg { - log.Error("启动文件错误!createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - return &result, fmt.Errorf("启动文件错误!") - } - if temp.ErrorMsg == DataSetErrorMsg { - log.Error("数据集错误!createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - return &result, fmt.Errorf("数据集错误!") - } - return &result, fmt.Errorf("createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - } - - if !result.IsSuccess { - log.Error("createInferenceJob failed(%s): %s", result.ErrorCode, result.ErrorMsg) - return &result, fmt.Errorf("createInferenceJob failed(%s): %s", result.ErrorCode, result.ErrorMsg) + if result.ErrorCode != 0 { + log.Error("GetJob failed(%d): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("GetJob failed(%d): %s", result.ErrorCode, result.ErrorMsg) } return &result, nil diff --git a/routers/api/v1/api.go b/routers/api/v1/api.go index 9a05aa8ae1..471f8be7ef 100755 --- a/routers/api/v1/api.go +++ b/routers/api/v1/api.go @@ -934,6 +934,15 @@ func RegisterRoutes(m *macaron.Macaron) { }) }) }, reqRepoReader(models.UnitTypeCloudBrain)) + m.Group("/grampus", func() { + m.Get("/:id", repo.GetCloudbrainTask) + m.Group("/train-job", func() { + m.Group("/:jobid", func() { + m.Post("/stop_version", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo_ext.GrampusStopJob) + m.Get("/log", repo_ext.GrampusGetLog) + }) + }) + }, reqRepoReader(models.UnitTypeCloudBrain)) }, repoAssignment()) }) diff --git a/routers/api/v1/repo/modelarts.go b/routers/api/v1/repo/modelarts.go index 06e4bea44d..c6f4b8b264 100755 --- a/routers/api/v1/repo/modelarts.go +++ b/routers/api/v1/repo/modelarts.go @@ -199,7 +199,7 @@ func GetModelArtsTrainJobVersion(ctx *context.APIContext) { } if job.StartTime == 0 && result.JobInfo.StartedAt > 0 { - job.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt / 1000) + job.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt) } job.Status = grampus.TransTrainJobStatus(result.JobInfo.Status) job.Duration = result.JobInfo.RunSec diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 7edb8c3bb5..69cd249011 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -1482,7 +1482,7 @@ func SyncCloudbrainStatus() { task.TrainJobDuration = models.ConvertDurationToStr(task.Duration) if task.StartTime == 0 && result.JobInfo.StartedAt > 0 { - task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt / 1000) + task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt) } if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 { task.EndTime = task.StartTime.Add(task.Duration) diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index c8c54a9287..ab9d712905 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -5,6 +5,7 @@ import ( "code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/grampus" "code.gitea.io/gitea/modules/modelarts" + "code.gitea.io/gitea/modules/timeutil" "code.gitea.io/gitea/modules/util" "encoding/json" "errors" @@ -25,13 +26,13 @@ import ( ) const ( + tplGrampusTrainJobShow base.TplName = "repo/grampus/trainjob/show" + //GPU - tplGrampusTrainJobGPUNew base.TplName = "repo/grampus/trainjob/gpu/new" - tplGrampusTrainJobGPUShow base.TplName = "repo/grampus/trainjob/gpu/show" + tplGrampusTrainJobGPUNew base.TplName = "repo/grampus/trainjob/gpu/new" //NPU - tplGrampusTrainJobNPUNew base.TplName = "repo/grampus/trainjob/npu/new" - tplGrampusTrainJobNPUShow base.TplName = "repo/grampus/trainjob/npu/show" + tplGrampusTrainJobNPUNew base.TplName = "repo/grampus/trainjob/npu/new" ) func GrampusTrainJobGPUNew(ctx *context.Context) { @@ -368,10 +369,10 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain JobName: jobName, DisplayJobName: displayJobName, ComputeResource: models.NPUResource, - Command: "echo \"test\"", - ResourceSpecId: "modelarts.kat1.xlarge", + Command: "echo test", + ResourceSpecId: "f2497d54732b45fb8d887e63be1db4a7", ImageUrl: "", - ImageId: "tensorflow_1.15-cann_5.0.3-py_3.7-euler_2.8.3-aarch64", + ImageId: "e6e85cd78ca24e158f71b6fac9c2fb95", DataUrl: dataPath, Description: description, @@ -412,3 +413,171 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain } ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") } + +func GrampusStopJob(ctx *context.Context) { + var ID = ctx.Params(":jobid") + var resultCode = "0" + var errorMsg = "" + var status = "" + + task := ctx.Cloudbrain + for { + if task.Status == string(models.GrampusStatusStopped) || task.Status == string(models.GrampusStatusFailed) || task.Status == string(models.GrampusStatusSucceeded) { + log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"]) + resultCode = "-1" + errorMsg = "system error" + break + } + + res, err := grampus.StopJob(task.JobID) + if err != nil { + log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"]) + resultCode = strconv.Itoa(res.ErrorCode) + errorMsg = res.ErrorMsg + break + } + + task.Status = string(models.GrampusStatusStopped) + if task.EndTime == 0 { + task.EndTime = timeutil.TimeStampNow() + } + task.ComputeAndSetDuration() + err = models.UpdateJob(task) + if err != nil { + log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"]) + resultCode = "-1" + errorMsg = "system error" + break + } + + status = task.Status + break + } + + ctx.JSON(200, map[string]interface{}{ + "result_code": resultCode, + "error_msg": errorMsg, + "status": status, + "id": ID, + "StatusOK": 0, + }) +} + +func GrampusTrainJobDel(ctx *context.Context) { + var listType = ctx.Query("listType") + if err := deleteGrampusJob(ctx); err != nil { + log.Error("deleteGrampusJob failed: %v", err, ctx.Data["msgID"]) + ctx.ServerError(err.Error(), err) + return + } + + var isAdminPage = ctx.Query("isadminpage") + var isHomePage = ctx.Query("ishomepage") + if ctx.IsUserSiteAdmin() && isAdminPage == "true" { + ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains") + } else if isHomePage == "true" { + ctx.Redirect(setting.AppSubURL + "/cloudbrains") + } else { + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType) + } +} + +func deleteGrampusJob(ctx *context.Context) error { + task := ctx.Cloudbrain + + if task.Status != string(models.GrampusStatusStopped) && task.Status != string(models.GrampusStatusSucceeded) && task.Status != string(models.GrampusStatusFailed) { + log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"]) + return errors.New("the job has not been stopped") + } + + err := models.DeleteJob(task) + if err != nil { + log.Error("DeleteJob failed: %v", err, ctx.Data["msgID"]) + return err + } + + storageType := models.TypeCloudBrainOne + if task.ComputeResource == models.NPUResource { + storageType = models.TypeCloudBrainTwo + } + deleteJobStorage(task.JobName, storageType) + + return nil +} + +func GrampusTrainJobShow(ctx *context.Context) { + ctx.Data["PageIsCloudBrain"] = true + //debugListType := ctx.Query("debugListType") + + var task *models.Cloudbrain + task, err := models.GetCloudbrainByJobIDWithDeleted(ctx.Params(":jobid")) + if err != nil { + log.Error("GetCloudbrainByJobID failed:" + err.Error()) + ctx.ServerError("system error", err) + return + } + + attachment, err := models.GetAttachmentByUUID(task.Uuid) + if err == nil { + task.DatasetName = attachment.Name + } + + taskList := make([]*models.Cloudbrain, 0) + taskList = append(taskList, task) + ctx.Data["version_list_task"] = taskList + + if task.DeletedAt.IsZero() { //normal record + result, err := grampus.GetJob(task.JobID) + if err != nil { + log.Error("GetJob failed:" + err.Error()) + ctx.ServerError("GetJob failed", err) + return + } + + if result != nil { + task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status) + if task.Status != result.JobInfo.Status || result.JobInfo.Status == models.GrampusStatusRunning { + task.Duration = result.JobInfo.RunSec + task.TrainJobDuration = models.ConvertDurationToStr(task.Duration) + + if task.StartTime == 0 && result.JobInfo.StartedAt > 0 { + task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt) + } + if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 { + task.EndTime = task.StartTime.Add(task.Duration) + } + task.CorrectCreateUnix() + err = models.UpdateJob(task) + if err != nil { + log.Error("UpdateJob failed:" + err.Error()) + } + } + } + } + + ctx.HTML(http.StatusOK, tplGrampusTrainJobShow) +} + +func GrampusGetLog(ctx *context.Context) { + jobID := ctx.Params(":jobid") + job, err := models.GetCloudbrainByJobID(jobID) + if err != nil { + log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"]) + ctx.ServerError(err.Error(), err) + return + } + + content, err := grampus.GetTrainJobLog(job.JobID) + if err != nil { + log.Error("GetJobLog failed: %v", err, ctx.Data["MsgID"]) + ctx.ServerError(err.Error(), err) + return + } + + ctx.JSON(http.StatusOK, map[string]interface{}{ + "JobName": job.JobName, + "Content": content, + }) + + return +} diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index ea4ff3b1d2..6f0d2f3732 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -587,11 +587,6 @@ func TrainJobIndex(ctx *context.Context) { for i, task := range tasks { tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain) tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain) - if task.Cloudbrain.Type == models.TypeCloudBrainOne { - tasks[i].ComputeResource = models.GPUResource - } else if task.Cloudbrain.Type == models.TypeCloudBrainTwo { - tasks[i].ComputeResource = models.NPUResource - } } pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5) diff --git a/routers/routes/routes.go b/routers/routes/routes.go index 6348e4ba8b..64566a1d7c 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -1085,23 +1085,17 @@ func RegisterRoutes(m *macaron.Macaron) { }, context.RepoRef()) m.Group("/grampus", func() { m.Group("/train-job", func() { + m.Group("/:jobid", func() { + m.Get("", reqRepoCloudBrainReader, repo.GrampusTrainJobShow) + m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.GrampusStopJob) + m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.GrampusTrainJobDel) + m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel) + }) m.Group("/gpu", func() { - m.Group("/:jobid", func() { - m.Get("", reqRepoCloudBrainReader, repo.CloudBrainTrainJobShow) - m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainStop) - m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainTrainJobDel) - m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel) - }) m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusTrainJobGPUNew) //m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.GrampusTrainJobCreate) }) m.Group("/npu", func() { - m.Group("/:jobid", func() { - m.Get("", reqRepoCloudBrainReader, repo.CloudBrainTrainJobShow) - m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainStop) - m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainTrainJobDel) - m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel) - }) m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusTrainJobNPUNew) m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateGrampusTrainJobForm{}), repo.GrampusTrainJobNpuCreate) }) diff --git a/templates/repo/grampus/trainjob/gpu/show.tmpl b/templates/repo/grampus/trainjob/gpu/show.tmpl deleted file mode 100755 index f1087abcfb..0000000000 --- a/templates/repo/grampus/trainjob/gpu/show.tmpl +++ /dev/null @@ -1,731 +0,0 @@ -{{template "base/head" .}} - -
-
-
-
-
-
-
-
-
-
- {{template "repo/header" .}} -
-

- -

- {{range $k ,$v := .version_list_task}} -
- -
-
-
- - - -
- {{TimeSinceUnix1 .CreatedUnix}} - - {{$.i18n.Tr "repo.modelarts.status"}}: - {{.Status}} - - {{$.i18n.Tr "repo.modelarts.train_job.dura_time"}}: - {{$.duration}} - -
-
-
-
-
-
-
-
- -
-
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- {{$.i18n.Tr "repo.cloudbrain_task"}} - -
- {{.DisplayJobName}} -
-
- {{$.i18n.Tr "repo.modelarts.status"}} - -
- {{.Status}} -
-
- {{$.i18n.Tr "repo.modelarts.train_job.start_time"}} - -
- {{TimeSinceUnix1 .CreatedUnix}} -
-
- {{$.i18n.Tr "repo.modelarts.train_job.dura_time"}} - -
- {{$.duration}} -
-
- {{$.i18n.Tr "repo.modelarts.train_job.resource_type"}} - -
- {{$.resource_type}} -
-
- {{$.i18n.Tr "repo.modelarts.train_job.standard"}} - -
- {{$.i18n.Tr "cloudbrain.gpu_num"}}:{{$.GpuNum}},{{$.i18n.Tr "cloudbrain.cpu_num"}}:{{$.CpuNum}},{{$.i18n.Tr "cloudbrain.memory"}}(MB):{{$.MemMiB}},{{$.i18n.Tr "cloudbrain.shared_memory"}}(MB):{{$.ShareMemMiB}} -
-
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- {{$.i18n.Tr "cloudbrain.mirror"}} - -
- {{.Image}} -
-
- {{$.i18n.Tr "repo.modelarts.code_version"}} - -
- {{.BranchName}} -
-
- {{$.i18n.Tr "repo.modelarts.train_job.start_file"}} - -
- {{.BootFile}} -
-
- {{$.i18n.Tr "repo.modelarts.train_job.train_dataset"}} - -
- {{.DatasetName}} -
-
- {{$.i18n.Tr "repo.modelarts.train_job.run_parameter"}} - -
- {{.Parameters}} -
-
- {{$.i18n.Tr "repo.modelarts.train_job.description"}} - -
- {{.Description}} -
-
-
-
-
- -
-
- -
-
- -
- - - - - -
- -
- -
- -
-
- -
- - -

-                            
- -
- -
- -
- - - -
- -
-
- -
-
-
- {{end}} {{template "base/paginate" .}} -
- -
- -
- - -
-{{template "base/footer" .}} - - \ No newline at end of file diff --git a/templates/repo/grampus/trainjob/npu/show.tmpl b/templates/repo/grampus/trainjob/show.tmpl similarity index 90% rename from templates/repo/grampus/trainjob/npu/show.tmpl rename to templates/repo/grampus/trainjob/show.tmpl index 8f168fcf9f..e228b2ed96 100755 --- a/templates/repo/grampus/trainjob/npu/show.tmpl +++ b/templates/repo/grampus/trainjob/show.tmpl @@ -232,47 +232,15 @@
{{$.CsrfTokenHtml}} - {{if and (.CanModify) (eq .Status "COMPLETED") ($.Permission.CanWrite $.UnitTypeModelManage) }} - {{$.i18n.Tr "repo.modelarts.create_model"}} - {{else}} - {{$.i18n.Tr "repo.modelarts.create_model"}} - {{end}} - - {{if .CanModify}} - {{$.i18n.Tr "repo.modelarts.modify"}} - {{else}} - {{$.i18n.Tr "repo.modelarts.modify"}} - {{end}} - - {{if .CanDel}} - {{$.i18n.Tr "repo.stop"}} - {{else}} - {{$.i18n.Tr "repo.stop"}} - {{end}} - - - {{if .CanDel}} - {{$.i18n.Tr "repo.delete"}} - {{else}} - {{$.i18n.Tr "repo.delete"}} - {{end}} +
- {{if not (eq .Cloudbrain.StartTime 0)}} - {{TimeSinceUnix1 .Cloudbrain.StartTime}} + {{if not (eq .StartTime 0)}} + {{TimeSinceUnix1 .StartTime}} {{else}} - {{TimeSinceUnix1 .Cloudbrain.CreatedUnix}} + {{TimeSinceUnix1 .CreatedUnix}} {{end}} {{$.i18n.Tr "repo.modelarts.current_version"}}:{{.VersionName}} @@ -355,10 +323,10 @@
- {{if not (eq .Cloudbrain.StartTime 0)}} - {{TimeSinceUnix1 .Cloudbrain.StartTime}} + {{if not (eq .StartTime 0)}} + {{TimeSinceUnix1 .StartTime}} {{else}} - {{TimeSinceUnix1 .Cloudbrain.CreatedUnix}} + {{TimeSinceUnix1 .CreatedUnix}} {{end}}
@@ -464,8 +432,8 @@
- {{.Cloudbrain.Description}} + title="{{.Description}}"> + {{.Description}}
@@ -479,15 +447,6 @@
- - - - - - @@ -701,7 +660,7 @@ return size + unitArr[index]; } function refreshStatus(version_name) { - $.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}?version_name=${version_name}`, (data) => { + $.get(`/api/v1/repos/${userName}/${repoPath}/grampus/train-job/${jobID}?version_name=${version_name}`, (data) => { // header status and duration $(`#${version_name}-duration-span`).text(data.JobDuration) $(`#${version_name}-status-span span`).text(data.JobStatus) @@ -758,7 +717,7 @@ }); } function loadLog(version_name) { - $.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/log?version_name=${version_name}&lines=50&order=asc`, (data) => { + $.get(`/api/v1/repos/${userName}/${repoPath}/grampus/train-job/${jobID}/log?version_name=${version_name}&lines=50&order=asc`, (data) => { $('input[name=end_line]').val(data.EndLine) $('input[name=start_line]').val(data.StartLine) $(`#log_file${version_name}`).text(data.Content) @@ -959,7 +918,7 @@ let logContentDom = document.querySelector(`#log${version_name}`) $(`#log_file${version_name}`).siblings('pre').remove() - $.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/log?version_name=${version_name}&base_line=&lines=50&order=asc`, (data) => { + $.get(`/api/v1/repos/${userName}/${repoPath}/grampus/train-job/${jobID}/log?version_name=${version_name}&base_line=&lines=50&order=asc`, (data) => { $(`#log${version_name} input[name=end_line]`).val(data.EndLine) //如果变动就改变所对应的值 $(`#log${version_name} input[name=start_line]`).val(data.StartLine) @@ -977,12 +936,12 @@ let version_name = $(this).data('version') let logContentDom = document.querySelector(`#log${version_name}`) $(`#log_file${version_name}`).siblings('pre').remove() - $.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/log?version_name=${version_name}&base_line=&lines=50&order=desc`, (data) => { + $.get(`/api/v1/repos/${userName}/${repoPath}/grampus/train-job/${jobID}/log?version_name=${version_name}&base_line=&lines=50&order=desc`, (data) => { $(`#log${version_name} input[name=end_line]`).val(data.EndLine) //如果变动就改变所对应的值 $(`#log${version_name} input[name=start_line]`).val(data.StartLine) $(`#log${version_name}`).append('
' + data.Content)
-            $.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/log?version_name=${version_name}&base_line=${data.EndLine}&lines=50&order=desc`, (data) => {
+            $.get(`/api/v1/repos/${userName}/${repoPath}/grampus/train-job/${jobID}/log?version_name=${version_name}&base_line=${data.EndLine}&lines=50&order=desc`, (data) => {
                 if (data.Lines == 0) {
                     $(`.message${version_name} #header`).text('您已翻阅至日志底部')
                     $(`.message${version_name}`).css('display', 'block')
diff --git a/templates/repo/modelarts/trainjob/index.tmpl b/templates/repo/modelarts/trainjob/index.tmpl
index edb146d7ec..37a547c474 100755
--- a/templates/repo/modelarts/trainjob/index.tmpl
+++ b/templates/repo/modelarts/trainjob/index.tmpl
@@ -112,7 +112,7 @@
 
                                 
                                 
- + {{.DisplayJobName}} @@ -153,7 +153,7 @@
{{$.CsrfTokenHtml}} {{if .CanDel}} - + {{$.i18n.Tr "repo.stop"}} {{else}} @@ -164,7 +164,7 @@
-
+ {{$.CsrfTokenHtml}} {{if .CanDel}} -- 2.34.1 From 6f642da6c323b70808360ff9d97a51d02d118a48 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Tue, 24 May 2022 19:51:40 +0800 Subject: [PATCH 06/56] opt --- modules/grampus/grampus.go | 7 ++++--- routers/repo/grampus.go | 38 ++++++++++++++++++++++++++------------ 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index 71e368fa6f..5f580189d7 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -78,6 +78,7 @@ type GenerateTrainJobReq struct { TotalVersionCount int ComputeResource string DatasetName string + Params string } func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) { @@ -119,9 +120,9 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error //EngineID: req.EngineID, TrainUrl: req.TrainUrl, BranchName: req.BranchName, - //Parameters: req.Params, - BootFile: req.BootFile, - DataUrl: req.DataUrl, + Parameters: req.Params, + BootFile: req.BootFile, + DataUrl: req.DataUrl, //LogUrl: req.LogUrl, //FlavorCode: req.FlavorCode, Description: req.Description, diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index ab9d712905..bf07cb0797 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -236,8 +236,6 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount) displayJobName := form.DisplayJobName jobName := util.ConvertDisplayJobNameToJobName(displayJobName) - //todo:del - jobName = displayJobName uuid := form.Attachment description := form.Description bootFile := form.BootFile @@ -252,8 +250,6 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain VersionCount := modelarts.VersionCount EngineName := form.EngineName - log.Info(jobName) - count, err := models.GetGrampusCountByUserID(ctx.User.ID, string(models.JobTypeTrain), models.NPUResource) if err != nil { log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"]) @@ -387,10 +383,10 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain //PoolID: poolID, Uuid: uuid, //Parameters: param, - CommitID: commitID, - IsLatestVersion: isLatestVersion, - BranchName: branchName, - //Params: form.Params, + CommitID: commitID, + IsLatestVersion: isLatestVersion, + BranchName: branchName, + Params: form.Params, FlavorName: FlavorName, EngineName: EngineName, VersionCount: VersionCount, @@ -507,7 +503,6 @@ func deleteGrampusJob(ctx *context.Context) error { func GrampusTrainJobShow(ctx *context.Context) { ctx.Data["PageIsCloudBrain"] = true - //debugListType := ctx.Query("debugListType") var task *models.Cloudbrain task, err := models.GetCloudbrainByJobIDWithDeleted(ctx.Params(":jobid")) @@ -522,9 +517,24 @@ func GrampusTrainJobShow(ctx *context.Context) { task.DatasetName = attachment.Name } - taskList := make([]*models.Cloudbrain, 0) - taskList = append(taskList, task) - ctx.Data["version_list_task"] = taskList + if len(task.Parameters) > 0 { + var parameters models.Parameters + err := json.Unmarshal([]byte(task.Parameters), ¶meters) + if err != nil { + log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err) + ctx.ServerError("system error", err) + return + } + + if len(parameters.Parameter) > 0 { + paramTemp := "" + for _, Parameter := range parameters.Parameter { + param := Parameter.Label + " = " + Parameter.Value + "; " + paramTemp = paramTemp + param + } + task.Parameters = paramTemp[:len(paramTemp)-2] + } + } if task.DeletedAt.IsZero() { //normal record result, err := grampus.GetJob(task.JobID) @@ -555,6 +565,10 @@ func GrampusTrainJobShow(ctx *context.Context) { } } + taskList := make([]*models.Cloudbrain, 0) + taskList = append(taskList, task) + ctx.Data["version_list_task"] = taskList + ctx.HTML(http.StatusOK, tplGrampusTrainJobShow) } -- 2.34.1 From 10e261c2e8a760a3e849952c0be4061da269c000 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Wed, 25 May 2022 18:34:37 +0800 Subject: [PATCH 07/56] create --- models/cloudbrain.go | 28 +++ modules/auth/grampus.go | 22 +- modules/grampus/grampus.go | 48 ++--- modules/grampus/resty.go | 59 ++++-- routers/repo/grampus.go | 210 ++++++++----------- templates/repo/grampus/trainjob/npu/new.tmpl | 31 +-- 6 files changed, 199 insertions(+), 199 deletions(-) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 6a6645d6b4..86af80235a 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -146,6 +146,7 @@ type Cloudbrain struct { PreVersionName string //父版本名称 ComputeResource string //计算资源,例如npu EngineID int64 //引擎id + ImageID string //grampus image_id TrainUrl string //输出模型的obs路径 BranchName string //分支名称 @@ -1180,6 +1181,33 @@ type GrampusJobInfo struct { Tasks []GrampusTasks `json:"tasks"` } +type GrampusSpec struct { + CreatedAt int64 `json:"createdAt"` + UpdatedAt int64 `json:"updatedAt"` + ID string `json:"id"` + Name string `json:"name"` + ProcessorType string `json:"processorType"` +} + +type GetGrampusResourceSpecsResult struct { + GrampusResult + Infos []GrampusSpec `json:"resourceSpecs"` +} + +type GrampusImage struct { + CreatedAt int64 `json:"createdAt"` + UpdatedAt int64 `json:"updatedAt"` + ID string `json:"id"` + Name string `json:"name"` + ProcessorType string `json:"processorType"` +} + +type GetGrampusImagesResult struct { + GrampusResult + TotalSize int `json:"totalSize"` + Infos []GrampusImage `json:"images"` +} + type CreateGrampusJobResponse struct { GrampusResult JobInfo GrampusJobInfo `json:"otJob"` diff --git a/modules/auth/grampus.go b/modules/auth/grampus.go index 2cfaf70061..b92d8d06d7 100755 --- a/modules/auth/grampus.go +++ b/modules/auth/grampus.go @@ -6,16 +6,18 @@ import ( ) type CreateGrampusTrainJobForm struct { - DisplayJobName string `form:"display_job_name" binding:"Required"` - JobName string `form:"job_name" binding:"Required"` - Attachment string `form:"attachment" binding:"Required"` - BootFile string `form:"boot_file" binding:"Required"` - Flavor string `form:"flavor" binding:"Required"` - Params string `form:"run_para_list" binding:"Required"` - Description string `form:"description"` - BranchName string `form:"branch_name" binding:"Required"` - FlavorName string `form:"flaver_names" binding:"Required"` - EngineName string `form:"engine_names" binding:"Required"` + DisplayJobName string `form:"display_job_name" binding:"Required"` + JobName string `form:"job_name" binding:"Required"` + Attachment string `form:"attachment" binding:"Required"` + BootFile string `form:"boot_file" binding:"Required"` + ImageID string `form:"image_id" binding:"Required"` + FlavorID string `form:"flavor" binding:"Required"` + Params string `form:"run_para_list" binding:"Required"` + Description string `form:"description"` + BranchName string `form:"branch_name" binding:"Required"` + FlavorName string `form:"flaver_names" binding:"Required"` + EngineName string `form:"engine_names" binding:"Required"` + WorkServerNumber int `form:"work_server_number" binding:"Required"` } func (f *CreateGrampusTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index 5f580189d7..13280cac33 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -42,6 +42,9 @@ const ( SortByCreateTime = "create_time" ConfigTypeCustom = "custom" TotalVersionCount = 1 + + ProcessorTypeNPU = "npu.huawei.com/NPU" + ProcessorTypeGPU = "nvidia.com/gpu" ) var ( @@ -54,7 +57,7 @@ type GenerateTrainJobReq struct { JobName string Command string ResourceSpecId string - ImageUrl string + ImageUrl string //与image_id二选一,都有的情况下优先image_url ImageId string DisplayJobName string @@ -102,29 +105,26 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error jobID := jobResult.JobInfo.JobID err = models.CreateCloudbrain(&models.Cloudbrain{ - Status: TransTrainJobStatus(jobResult.JobInfo.Status), - UserID: ctx.User.ID, - RepoID: ctx.Repo.Repository.ID, - JobID: jobID, - JobName: req.JobName, - DisplayJobName: req.DisplayJobName, - JobType: string(models.JobTypeTrain), - Type: models.TypeCloudBrainGrampus, - //VersionID: jobResult.VersionID, - //VersionName: jobResult.VersionName, - Uuid: req.Uuid, - DatasetName: req.DatasetName, - CommitID: req.CommitID, - IsLatestVersion: req.IsLatestVersion, - ComputeResource: req.ComputeResource, - //EngineID: req.EngineID, - TrainUrl: req.TrainUrl, - BranchName: req.BranchName, - Parameters: req.Params, - BootFile: req.BootFile, - DataUrl: req.DataUrl, - //LogUrl: req.LogUrl, - //FlavorCode: req.FlavorCode, + Status: TransTrainJobStatus(jobResult.JobInfo.Status), + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: jobID, + JobName: req.JobName, + DisplayJobName: req.DisplayJobName, + JobType: string(models.JobTypeTrain), + Type: models.TypeCloudBrainGrampus, + Uuid: req.Uuid, + DatasetName: req.DatasetName, + CommitID: req.CommitID, + IsLatestVersion: req.IsLatestVersion, + ComputeResource: req.ComputeResource, + ImageID: req.ImageId, + TrainUrl: req.TrainUrl, + BranchName: req.BranchName, + Parameters: req.Params, + BootFile: req.BootFile, + DataUrl: req.DataUrl, + FlavorCode: req.ResourceSpecId, Description: req.Description, WorkServerNumber: req.WorkServerNumber, FlavorName: req.FlavorName, diff --git a/modules/grampus/resty.go b/modules/grampus/resty.go index 0d87f390d0..183afb853b 100755 --- a/modules/grampus/resty.go +++ b/modules/grampus/resty.go @@ -22,7 +22,8 @@ const ( urlGetToken = urlOpenApiV1 + "token" urlTrainJob = urlOpenApiV1 + "trainjob" - urlResourceSpecs = "/job/resource-specs" + urlGetResourceSpecs = urlOpenApiV1 + "resourcespec" + urlGetImages = urlOpenApiV1 + "image" urlTrainJobConfig = "/training-job-configs" errorCodeExceedLimit = "ModelArts.0118" @@ -155,43 +156,65 @@ sendjob: return &result, nil } -func GetResourceSpecs() (*models.GetResourceSpecsResult, error) { +func GetResourceSpecs(processorType string) (*models.GetGrampusResourceSpecsResult, error) { checkSetting() client := getRestyClient() - var result models.GetResourceSpecsResult + var result models.GetGrampusResourceSpecsResult retry := 0 sendjob: - res, err := client.R(). - SetHeader("Content-Type", "application/json"). + _, err := client.R(). SetAuthToken(TOKEN). SetResult(&result). - Get(HOST + "/v1/" + setting.ProjectID + urlResourceSpecs) + Get(HOST + urlGetResourceSpecs + "?processorType=" + processorType) if err != nil { return nil, fmt.Errorf("resty GetResourceSpecs: %v", err) } - if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + if result.ErrorCode == errorIllegalToken && retry < 1 { retry++ + log.Info("retry get token") _ = getToken() goto sendjob } - if res.StatusCode() != http.StatusOK { - var temp models.ErrorResult - if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { - log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) - } - log.Error("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) - return &result, fmt.Errorf("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + if result.ErrorCode != 0 { + log.Error("GetResourceSpecs failed(%d): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("GetResourceSpecs failed(%d): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + +func GetImages(processorType string) (*models.GetGrampusImagesResult, error) { + checkSetting() + client := getRestyClient() + var result models.GetGrampusImagesResult + + retry := 0 + +sendjob: + _, err := client.R(). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + urlGetImages + "?processorType=" + processorType) + + if err != nil { + return nil, fmt.Errorf("resty GetImages: %v", err) } - if !result.IsSuccess { - log.Error("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) - return &result, fmt.Errorf("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) + if result.ErrorCode == errorIllegalToken && retry < 1 { + retry++ + log.Info("retry get token") + _ = getToken() + goto sendjob + } + + if result.ErrorCode != 0 { + log.Error("GetImages failed(%d): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("GetImages failed(%d): %s", result.ErrorCode, result.ErrorMsg) } return &result, nil diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index bf07cb0797..3f4b1361cd 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -135,18 +135,13 @@ func grampusGpuNewDataPrepare(ctx *context.Context) error { json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs) } ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec - ctx.Data["params"] = "" - ctx.Data["branchName"] = ctx.Repo.BranchName - - ctx.Data["snn4imagenet_path"] = cloudbrain.Snn4imagenetMountPath - ctx.Data["is_snn4imagenet_enabled"] = setting.IsSnn4imagenetEnabled - - ctx.Data["brainscore_path"] = cloudbrain.BrainScoreMountPath - ctx.Data["is_brainscore_enabled"] = setting.IsBrainScoreEnabled - ctx.Data["cloudbraintype"] = models.TypeCloudBrainOne - - ctx.Data["benchmarkMode"] = ctx.Query("benchmarkMode") + branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0) + if err != nil { + log.Error("GetBranches error:", err) + } + ctx.Data["branches"] = branches + ctx.Data["branchName"] = ctx.Repo.BranchName return nil } @@ -170,51 +165,37 @@ func grampusTrainJobNpuNewDataPrepare(ctx *context.Context) error { //get valid dataset attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID) if err != nil { - ctx.ServerError("GetAllUserAttachments failed:", err) - return err - } - ctx.Data["attachments"] = attachs - - //get valid resource specs - var resourcePools modelarts.ResourcePool - if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err + log.Error("GetModelArtsTrainAttachments failed:", err.Error()) + } else { + ctx.Data["attachments"] = attachs } - ctx.Data["resource_pools"] = resourcePools.Info - var engines modelarts.Engine - if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err + //get valid engines + images, err := grampus.GetImages(grampus.ProcessorTypeNPU) + if err != nil { + log.Error("GetResourceSpecs failed:", err.Error()) + } else { + ctx.Data["engine_versions"] = images.Infos } - ctx.Data["engines"] = engines.Info - var versionInfos modelarts.VersionInfo - if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err + //get valid resource specs + specs, err := grampus.GetResourceSpecs(grampus.ProcessorTypeNPU) + if err != nil { + log.Error("GetResourceSpecs failed:", err.Error()) + } else { + ctx.Data["flavor_infos"] = specs.Infos } - ctx.Data["engine_versions"] = versionInfos.Version - var flavorInfos modelarts.Flavor - if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return err + //get branches + branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0) + if err != nil { + log.Error("GetBranches error:", err.Error()) + } else { + ctx.Data["branches"] = branches } - ctx.Data["flavor_infos"] = flavorInfos.Info - ctx.Data["params"] = "" ctx.Data["branchName"] = ctx.Repo.BranchName - configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom) - if err != nil { - ctx.ServerError("getConfigList failed:", err) - return err - } - ctx.Data["config_list"] = configList.ParaConfigs - ctx.Data["cloudbraintype"] = models.TypeCloudBrainTwo - return nil } @@ -246,10 +227,11 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/" branchName := form.BranchName isLatestVersion := modelarts.IsLatestVersion - FlavorName := form.FlavorName - VersionCount := modelarts.VersionCount - EngineName := form.EngineName + flavorName := form.FlavorName + versionCount := modelarts.VersionCount + engineName := form.EngineName + //check count limit count, err := models.GetGrampusCountByUserID(ctx.User.ID, string(models.JobTypeTrain), models.NPUResource) if err != nil { log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"]) @@ -265,12 +247,14 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain } } + //check param if err := grampusParamCheckCreateTrainJob(form); err != nil { log.Error("paramCheckCreateTrainJob failed:(%v)", err) grampusTrainJobNpuNewDataPrepare(ctx) ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form) return } + //check whether the task name in the project is duplicated tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName) if err == nil { @@ -295,9 +279,6 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain os.RemoveAll(codeLocalPath) } - gitRepo, _ := git.OpenRepository(repo.RepoPath()) - commitID, _ := gitRepo.GetBranchCommitID(branchName) - if err := downloadCode(repo, codeLocalPath, branchName); err != nil { log.Error("downloadCode failed, server timed out: %s (%v)", repo.FullName(), err) grampusTrainJobNpuNewDataPrepare(ctx) @@ -321,7 +302,6 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain } if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { - // if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil { log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) grampusTrainJobNpuNewDataPrepare(ctx) ctx.RenderWithErr("Failed to uploadCodeToObs", tplGrampusTrainJobNPUNew, &form) @@ -330,9 +310,9 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain //prepare command //todo: download code, download dataset, unzip dataset, exec code, upload model + command, err := generateCommand(grampus.ProcessorTypeNPU, codeObsPath, dataPath, params, "") var parameters models.Parameters param := make([]models.Parameter, 0) - existDeviceTarget := false if len(params) != 0 { err := json.Unmarshal([]byte(params), ¶meters) if err != nil { @@ -343,63 +323,45 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain } for _, parameter := range parameters.Parameter { - if parameter.Label == modelarts.DeviceTarget { - existDeviceTarget = true - } - if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl { - param = append(param, models.Parameter{ - Label: parameter.Label, - Value: parameter.Value, - }) - } + param = append(param, models.Parameter{ + Label: parameter.Label, + Value: parameter.Value, + }) } } - if !existDeviceTarget { - param = append(param, models.Parameter{ - Label: modelarts.DeviceTarget, - Value: modelarts.Ascend, - }) - } + param = append(param, models.Parameter{ + Label: modelarts.DeviceTarget, + Value: modelarts.Ascend, + }) + + gitRepo, _ := git.OpenRepository(repo.RepoPath()) + commitID, _ := gitRepo.GetBranchCommitID(branchName) req := &grampus.GenerateTrainJobReq{ - JobName: jobName, - DisplayJobName: displayJobName, - ComputeResource: models.NPUResource, - Command: "echo test", - ResourceSpecId: "f2497d54732b45fb8d887e63be1db4a7", - ImageUrl: "", - ImageId: "e6e85cd78ca24e158f71b6fac9c2fb95", - - DataUrl: dataPath, - Description: description, - CodeObsPath: codeObsPath, - BootFileUrl: codeObsPath + bootFile, - BootFile: bootFile, - //TrainUrl: outputObsPath, - //FlavorCode: flavorCode, - WorkServerNumber: 1, - //EngineID: int64(engineID), - //LogUrl: logObsPath, - //PoolID: poolID, - Uuid: uuid, - //Parameters: param, + JobName: jobName, + DisplayJobName: displayJobName, + ComputeResource: models.NPUResource, + Command: command, + ResourceSpecId: form.FlavorID, + ImageUrl: "", + ImageId: form.ImageID, + DataUrl: dataPath, + Description: description, + CodeObsPath: codeObsPath, + BootFileUrl: codeObsPath + bootFile, + BootFile: bootFile, + WorkServerNumber: form.WorkServerNumber, + Uuid: uuid, CommitID: commitID, IsLatestVersion: isLatestVersion, BranchName: branchName, Params: form.Params, - FlavorName: FlavorName, - EngineName: EngineName, - VersionCount: VersionCount, + FlavorName: flavorName, + EngineName: engineName, + VersionCount: versionCount, TotalVersionCount: modelarts.TotalVersionCount, } - //将params转换Parameters.Parameter,出错时返回给前端 - var Parameters modelarts.Parameters - if err := json.Unmarshal([]byte(params), &Parameters); err != nil { - ctx.ServerError("json.Unmarshal failed:", err) - return - } - err = grampus.GenerateTrainJob(ctx, req) if err != nil { log.Error("GenerateTrainJob failed:%v", err.Error()) @@ -517,25 +479,6 @@ func GrampusTrainJobShow(ctx *context.Context) { task.DatasetName = attachment.Name } - if len(task.Parameters) > 0 { - var parameters models.Parameters - err := json.Unmarshal([]byte(task.Parameters), ¶meters) - if err != nil { - log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err) - ctx.ServerError("system error", err) - return - } - - if len(parameters.Parameter) > 0 { - paramTemp := "" - for _, Parameter := range parameters.Parameter { - param := Parameter.Label + " = " + Parameter.Value + "; " - paramTemp = paramTemp + param - } - task.Parameters = paramTemp[:len(paramTemp)-2] - } - } - if task.DeletedAt.IsZero() { //normal record result, err := grampus.GetJob(task.JobID) if err != nil { @@ -565,6 +508,25 @@ func GrampusTrainJobShow(ctx *context.Context) { } } + if len(task.Parameters) > 0 { + var parameters models.Parameters + err := json.Unmarshal([]byte(task.Parameters), ¶meters) + if err != nil { + log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err) + ctx.ServerError("system error", err) + return + } + + if len(parameters.Parameter) > 0 { + paramTemp := "" + for _, Parameter := range parameters.Parameter { + param := Parameter.Label + " = " + Parameter.Value + "; " + paramTemp = paramTemp + param + } + task.Parameters = paramTemp[:len(paramTemp)-2] + } + } + taskList := make([]*models.Cloudbrain, 0) taskList = append(taskList, task) ctx.Data["version_list_task"] = taskList @@ -595,3 +557,13 @@ func GrampusGetLog(ctx *context.Context) { return } + +func generateCommand(processorType, codePath, dataPath, params, outputPath string) (string, error) { + var command string + //download code + //download dataset + //unzip dataset + //exec code + //upload models + return command, nil +} diff --git a/templates/repo/grampus/trainjob/npu/new.tmpl b/templates/repo/grampus/trainjob/npu/new.tmpl index 6f5f5455f0..9e5ba39bcc 100755 --- a/templates/repo/grampus/trainjob/npu/new.tmpl +++ b/templates/repo/grampus/trainjob/npu/new.tmpl @@ -136,18 +136,10 @@
-
- -
-
- {{range .engine_versions}} - + {{end}} @@ -175,22 +167,6 @@ {{.i18n.Tr "repo.modelarts.train_job.add_run_parameter"}}
- {{if ne 0 (len .params)}} - {{range $k ,$v := .params}} -
-
- -
-
- -
- - - - -
- {{end}} - {{end}}
@@ -224,7 +200,7 @@
@@ -237,7 +213,6 @@
-- 2.34.1 From 685a14ba1930c5643a352326b4c47d40ae4fe2a5 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Thu, 26 May 2022 20:54:02 +0800 Subject: [PATCH 08/56] generate command --- modules/grampus/grampus.go | 15 +++++------ modules/util/path.go | 10 ++++++++ routers/repo/grampus.go | 51 +++++++++++++++++++++++++++++++------- 3 files changed, 58 insertions(+), 18 deletions(-) mode change 100644 => 100755 modules/util/path.go diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index 13280cac33..bde663ebf6 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -10,17 +10,11 @@ import ( ) const ( - //notebook - storageTypeOBS = "obs" - autoStopDuration = 4 * 60 * 60 - autoStopDurationMs = 4 * 60 * 60 * 1000 - - DataSetMountPath = "/home/ma-user/work" - NotebookEnv = "Python3" - NotebookType = "Ascend" - FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" + storageTypeOBS = "obs" + WorkPath = "/home/ma-user/work" CodePath = "/code/" + DatasetPath = "/dataset" OutputPath = "/output/" ResultPath = "/result/" LogPath = "/log/" @@ -45,6 +39,9 @@ const ( ProcessorTypeNPU = "npu.huawei.com/NPU" ProcessorTypeGPU = "nvidia.com/gpu" + + CommandPrepareScript = "pwd;cd /tmp;wget https://git.openi.org.cn/lewis/script_for_grampus/archive/master.zip;unzip master.zip;cd script_for_grampus;" + ScriptSyncObsCodeAndDataset = "sync_obs_code_and_dataset.py" ) var ( diff --git a/modules/util/path.go b/modules/util/path.go old mode 100644 new mode 100755 index 2b198eb6dc..1db6e43793 --- a/modules/util/path.go +++ b/modules/util/path.go @@ -31,3 +31,13 @@ func GetDirectorySize(path string) (int64, error) { }) return size, err } + +// check whether the path is dir +func IsDir(path string) bool { + s, err := os.Stat(path) + if err != nil { + return false + } + + return s.IsDir() +} diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index 3f4b1361cd..c16363da49 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -9,6 +9,7 @@ import ( "code.gitea.io/gitea/modules/util" "encoding/json" "errors" + "fmt" "io/ioutil" "net/http" "os" @@ -273,6 +274,15 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain } } + //check dataset + attachment, err := models.GetAttachmentByUUID(uuid) + if err != nil { + log.Error("GetAttachmentByUUID failed:", err.Error(), ctx.Data["MsgID"]) + grampusTrainJobNpuNewDataPrepare(ctx) + ctx.RenderWithErr("dataset is not exist", tplGrampusTrainJobNPUNew, &form) + return + } + //prepare code and out path _, err = ioutil.ReadDir(codeLocalPath) if err == nil { @@ -310,7 +320,8 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain //prepare command //todo: download code, download dataset, unzip dataset, exec code, upload model - command, err := generateCommand(grampus.ProcessorTypeNPU, codeObsPath, dataPath, params, "") + command, err := generateCommand(grampus.ProcessorTypeNPU, "obs:/"+codeObsPath, "obs:/"+dataPath, params, "", attachment.Name) + log.Info(command) var parameters models.Parameters param := make([]models.Parameter, 0) if len(params) != 0 { @@ -360,6 +371,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain EngineName: engineName, VersionCount: versionCount, TotalVersionCount: modelarts.TotalVersionCount, + DatasetName: attachment.Name, } err = grampus.GenerateTrainJob(ctx, req) @@ -474,11 +486,6 @@ func GrampusTrainJobShow(ctx *context.Context) { return } - attachment, err := models.GetAttachmentByUUID(task.Uuid) - if err == nil { - task.DatasetName = attachment.Name - } - if task.DeletedAt.IsZero() { //normal record result, err := grampus.GetJob(task.JobID) if err != nil { @@ -524,6 +531,8 @@ func GrampusTrainJobShow(ctx *context.Context) { paramTemp = paramTemp + param } task.Parameters = paramTemp[:len(paramTemp)-2] + } else { + task.Parameters = "" } } @@ -558,12 +567,36 @@ func GrampusGetLog(ctx *context.Context) { return } -func generateCommand(processorType, codePath, dataPath, params, outputPath string) (string, error) { +func generateCommand(processorType, codeObsPath, dataObsPath, params, outputPath, datasetName string) (string, error) { var command string - //download code - //download dataset + + command += grampus.CommandPrepareScript + //download code & dataset + if processorType == grampus.ProcessorTypeNPU { + commandDownload := "python " + grampus.ScriptSyncObsCodeAndDataset + " --access_key=" + setting.AccessKeyID + " --secret_key=" + setting.SecretAccessKey + " --project_id=" + setting.ProjectID + " --region_name=" + setting.Location + " --code_obs_dir=" + codeObsPath + " --data_obs_dir=" + dataObsPath + " --dataset_name=" + datasetName + ";" + command += commandDownload + } else if processorType == grampus.ProcessorTypeGPU { + + } + //unzip dataset //exec code //upload models return command, nil } + +func generateCommandObsDownloadFile(srcObsFile, dstLocalDir string) (string, error) { + var command string + + command = "python;" + command += "from modelarts.session import Session \n" + command += fmt.Sprintf("session = Session(access_key='%s',secret_key='%s', project_id='%s', region_name='%s') \n", setting.AccessKeyID, setting.SecretAccessKey, setting.ProjectID, setting.Location) + + if util.IsDir(srcObsFile) { + command += fmt.Sprintf("session.obs.download_dir(src_obs_dir=\"%s\", dst_local_dir=\"%s\") \n", srcObsFile, dstLocalDir) + } else { + command += fmt.Sprintf("session.obs.download_file(src_obs_file=\"%s\", dst_local_dir=\"%s\") \n", srcObsFile, dstLocalDir) + } + + return command, nil +} -- 2.34.1 From d2ed49fbacee9bd920adf5f06e42a1341a4e6739 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Mon, 30 May 2022 16:01:43 +0800 Subject: [PATCH 09/56] gen command --- modules/grampus/grampus.go | 6 +- routers/repo/grampus.go | 115 ++++++++++++++++++++++++++----------- 2 files changed, 85 insertions(+), 36 deletions(-) diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index bde663ebf6..45ff979a5a 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -18,7 +18,7 @@ const ( OutputPath = "/output/" ResultPath = "/result/" LogPath = "/log/" - JobPath = "/job/" + JobPath = "job/" OrderDesc = "desc" //向下查询 OrderAsc = "asc" //向上查询 Lines = 500 @@ -40,8 +40,8 @@ const ( ProcessorTypeNPU = "npu.huawei.com/NPU" ProcessorTypeGPU = "nvidia.com/gpu" - CommandPrepareScript = "pwd;cd /tmp;wget https://git.openi.org.cn/lewis/script_for_grampus/archive/master.zip;unzip master.zip;cd script_for_grampus;" - ScriptSyncObsCodeAndDataset = "sync_obs_code_and_dataset.py" + CommandPrepareScript = "pwd;cd /tmp;wget https://git.openi.org.cn/lewis/script_for_grampus/archive/master.zip;unzip master.zip;cd script_for_grampus;chmod 777 sync_for_arm;" + CodeArchiveName = "master.zip" ) var ( diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index c16363da49..d3c8642ba5 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -10,6 +10,7 @@ import ( "encoding/json" "errors" "fmt" + "github.com/unknwon/com" "io/ioutil" "net/http" "os" @@ -224,8 +225,8 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain params := form.Params repo := ctx.Repo.Repository codeLocalPath := setting.JobPath + jobName + modelarts.CodePath - codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath - dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/" + codeObsPath := grampus.JobPath + jobName + modelarts.CodePath + dataObsPath := setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" branchName := form.BranchName isLatestVersion := modelarts.IsLatestVersion flavorName := form.FlavorName @@ -289,8 +290,8 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain os.RemoveAll(codeLocalPath) } - if err := downloadCode(repo, codeLocalPath, branchName); err != nil { - log.Error("downloadCode failed, server timed out: %s (%v)", repo.FullName(), err) + if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil { + log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err) grampusTrainJobNpuNewDataPrepare(ctx) ctx.RenderWithErr("Create task failed, server timed out", tplGrampusTrainJobNPUNew, &form) return @@ -320,33 +321,10 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain //prepare command //todo: download code, download dataset, unzip dataset, exec code, upload model - command, err := generateCommand(grampus.ProcessorTypeNPU, "obs:/"+codeObsPath, "obs:/"+dataPath, params, "", attachment.Name) + command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", dataObsPath+attachment.Name, bootFile, params, "", attachment.Name) log.Info(command) - var parameters models.Parameters - param := make([]models.Parameter, 0) - if len(params) != 0 { - err := json.Unmarshal([]byte(params), ¶meters) - if err != nil { - log.Error("Failed to Unmarshal params: %s (%v)", params, err) - grampusTrainJobNpuNewDataPrepare(ctx) - ctx.RenderWithErr("运行参数错误", tplGrampusTrainJobNPUNew, &form) - return - } - for _, parameter := range parameters.Parameter { - param = append(param, models.Parameter{ - Label: parameter.Label, - Value: parameter.Value, - }) - } - } - param = append(param, models.Parameter{ - Label: modelarts.DeviceTarget, - Value: modelarts.Ascend, - }) - - gitRepo, _ := git.OpenRepository(repo.RepoPath()) - commitID, _ := gitRepo.GetBranchCommitID(branchName) + commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName) req := &grampus.GenerateTrainJobReq{ JobName: jobName, @@ -356,7 +334,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain ResourceSpecId: form.FlavorID, ImageUrl: "", ImageId: form.ImageID, - DataUrl: dataPath, + DataUrl: dataObsPath, Description: description, CodeObsPath: codeObsPath, BootFileUrl: codeObsPath + bootFile, @@ -567,21 +545,47 @@ func GrampusGetLog(ctx *context.Context) { return } -func generateCommand(processorType, codeObsPath, dataObsPath, params, outputPath, datasetName string) (string, error) { +func generateCommand(repoName, processorType, codeObsPath, dataObsPath, bootFile, paramSrc, outputPath, datasetName string) (string, error) { var command string command += grampus.CommandPrepareScript //download code & dataset if processorType == grampus.ProcessorTypeNPU { - commandDownload := "python " + grampus.ScriptSyncObsCodeAndDataset + " --access_key=" + setting.AccessKeyID + " --secret_key=" + setting.SecretAccessKey + " --project_id=" + setting.ProjectID + " --region_name=" + setting.Location + " --code_obs_dir=" + codeObsPath + " --data_obs_dir=" + dataObsPath + " --dataset_name=" + datasetName + ";" + commandDownload := "./sync_for_arm " + setting.Bucket + " " + codeObsPath + " " + grampus.CodeArchiveName + " " + dataObsPath + " " + datasetName + ";" command += commandDownload } else if processorType == grampus.ProcessorTypeGPU { } - //unzip dataset + //unzip code & dataset + commandUnzip := "cd dataset;unzip " + datasetName + ";cd ../code;unzip master.zip;" + command += commandUnzip + //exec code + var parameters models.Parameters + var paramCode string + param := make([]models.Parameter, 0) + if len(paramSrc) != 0 { + err := json.Unmarshal([]byte(paramSrc), ¶meters) + if err != nil { + log.Error("Failed to Unmarshal params: %s (%v)", paramSrc, err) + return command, err + } + + for _, parameter := range parameters.Parameter { + param = append(param, models.Parameter{ + Label: parameter.Label, + Value: parameter.Value, + }) + paramCode += " --" + parameter.Label + "=" + parameter.Value + } + } + + commandCode := "cd " + repoName + ";python " + bootFile + paramCode + command += commandCode + //upload models + return command, nil } @@ -600,3 +604,48 @@ func generateCommandObsDownloadFile(srcObsFile, dstLocalDir string) (string, err return command, nil } + +func downloadZipCode(ctx *context.Context, codePath, branchName string) error { + archiveType := git.ZIP + archivePath := codePath + + if !com.IsDir(archivePath) { + if err := os.MkdirAll(archivePath, os.ModePerm); err != nil { + log.Error("MkdirAll failed:" + err.Error()) + return err + } + } + + // Get corresponding commit. + var ( + commit *git.Commit + err error + ) + + gitRepo := ctx.Repo.GitRepo + if err != nil { + log.Error("OpenRepository failed:" + err.Error()) + return err + } + + if gitRepo.IsBranchExist(branchName) { + commit, err = gitRepo.GetBranchCommit(branchName) + if err != nil { + log.Error("GetBranchCommit failed:" + err.Error()) + return err + } + } + + archivePath = path.Join(archivePath, grampus.CodeArchiveName) + if !com.IsFile(archivePath) { + if err := commit.CreateArchive(archivePath, git.CreateArchiveOpts{ + Format: archiveType, + Prefix: setting.Repository.PrefixArchiveFiles, + }); err != nil { + log.Error("CreateArchive failed:" + err.Error()) + return err + } + } + + return nil +} -- 2.34.1 From 8eeaf779354419cef77cac29f6435839154a4d77 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Mon, 30 May 2022 20:14:42 +0800 Subject: [PATCH 10/56] debug --- modules/grampus/grampus.go | 2 +- routers/repo/grampus.go | 20 +++++++------------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index 45ff979a5a..4c46296e5d 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -40,7 +40,7 @@ const ( ProcessorTypeNPU = "npu.huawei.com/NPU" ProcessorTypeGPU = "nvidia.com/gpu" - CommandPrepareScript = "pwd;cd /tmp;wget https://git.openi.org.cn/lewis/script_for_grampus/archive/master.zip;unzip master.zip;cd script_for_grampus;chmod 777 sync_for_arm;" + CommandPrepareScript = "pwd;cd /tmp;mkdir output;mkdir code;mkdir dataset;wget https://git.openi.org.cn/lewis/script_for_grampus/archive/master.zip;unzip master.zip;cd script_for_grampus;chmod 777 sync_for_arm uploader_for_grampus;" CodeArchiveName = "master.zip" ) diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index d3c8642ba5..552fc9b705 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -216,7 +216,6 @@ func grampusParamCheckCreateTrainJob(form auth.CreateGrampusTrainJobForm) error } func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) { - VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount) displayJobName := form.DisplayJobName jobName := util.ConvertDisplayJobNameToJobName(displayJobName) uuid := form.Attachment @@ -298,20 +297,13 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain } //todo: upload code (send to file_server todo this work?) - if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil { + if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil { log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err) grampusTrainJobNpuNewDataPrepare(ctx) ctx.RenderWithErr("Failed to obsMkdir_output", tplGrampusTrainJobNPUNew, &form) return } - if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil { - log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err) - grampusTrainJobNpuNewDataPrepare(ctx) - ctx.RenderWithErr("Failed to obsMkdir_log", tplGrampusTrainJobNPUNew, &form) - return - } - if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) grampusTrainJobNpuNewDataPrepare(ctx) @@ -321,7 +313,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain //prepare command //todo: download code, download dataset, unzip dataset, exec code, upload model - command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", dataObsPath+attachment.Name, bootFile, params, "", attachment.Name) + command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", dataObsPath+attachment.Name, bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, attachment.Name) log.Info(command) commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName) @@ -545,7 +537,7 @@ func GrampusGetLog(ctx *context.Context) { return } -func generateCommand(repoName, processorType, codeObsPath, dataObsPath, bootFile, paramSrc, outputPath, datasetName string) (string, error) { +func generateCommand(repoName, processorType, codeObsPath, dataObsPath, bootFile, paramSrc, outputObsPath, datasetName string) (string, error) { var command string command += grampus.CommandPrepareScript @@ -558,7 +550,7 @@ func generateCommand(repoName, processorType, codeObsPath, dataObsPath, bootFile } //unzip code & dataset - commandUnzip := "cd dataset;unzip " + datasetName + ";cd ../code;unzip master.zip;" + commandUnzip := "cd /tmp/dataset;unzip " + datasetName + ";cd /tmp/code;unzip master.zip;" command += commandUnzip //exec code @@ -581,10 +573,12 @@ func generateCommand(repoName, processorType, codeObsPath, dataObsPath, bootFile } } - commandCode := "cd " + repoName + ";python " + bootFile + paramCode + commandCode := "cd " + repoName + ";python " + bootFile + paramCode + ";" command += commandCode //upload models + commandUpload := "cd /tmp/script_for_grampus/;./uploader_for_grampus " + setting.Bucket + " " + outputObsPath + " " + "/tmp/output/;" + command += commandUpload return command, nil } -- 2.34.1 From e46202f0acbc6d74981f461eb8d2ad5489579985 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Mon, 30 May 2022 20:54:08 +0800 Subject: [PATCH 11/56] debug --- modules/grampus/grampus.go | 2 +- routers/repo/grampus.go | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index 4c46296e5d..95985e533b 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -40,7 +40,7 @@ const ( ProcessorTypeNPU = "npu.huawei.com/NPU" ProcessorTypeGPU = "nvidia.com/gpu" - CommandPrepareScript = "pwd;cd /tmp;mkdir output;mkdir code;mkdir dataset;wget https://git.openi.org.cn/lewis/script_for_grampus/archive/master.zip;unzip master.zip;cd script_for_grampus;chmod 777 sync_for_arm uploader_for_grampus;" + CommandPrepareScript = "pwd;cd /tmp;mkdir output;mkdir code;mkdir dataset;wget -q https://git.openi.org.cn/lewis/script_for_grampus/archive/master.zip;unzip -q master.zip;cd script_for_grampus;chmod 777 sync_for_arm uploader_for_grampus;" CodeArchiveName = "master.zip" ) diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index 552fc9b705..542b1c3865 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -312,9 +312,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain } //prepare command - //todo: download code, download dataset, unzip dataset, exec code, upload model command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", dataObsPath+attachment.Name, bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, attachment.Name) - log.Info(command) commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName) @@ -550,7 +548,12 @@ func generateCommand(repoName, processorType, codeObsPath, dataObsPath, bootFile } //unzip code & dataset - commandUnzip := "cd /tmp/dataset;unzip " + datasetName + ";cd /tmp/code;unzip master.zip;" + toolUnzip := "unzip -q " + if strings.HasSuffix(datasetName, ".tar.gz") { + toolUnzip = "tar -zxvf " + } + commandUnzip := "cd /tmp/dataset;" + toolUnzip + datasetName + ";cd /tmp/code;unzip -q master.zip;" + commandUnzip += "cd /tmp/dataset/" + strings.TrimSuffix(datasetName, ".zip") + ";ls;" command += commandUnzip //exec code @@ -573,7 +576,7 @@ func generateCommand(repoName, processorType, codeObsPath, dataObsPath, bootFile } } - commandCode := "cd " + repoName + ";python " + bootFile + paramCode + ";" + commandCode := "cd /tmp/code/" + repoName + ";python " + bootFile + paramCode + ";" command += commandCode //upload models -- 2.34.1 From 99b2c851255c8bad881300517a8feeb59cf072f1 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Tue, 31 May 2022 18:02:30 +0800 Subject: [PATCH 12/56] view --- models/cloudbrain.go | 14 +++++--- modules/grampus/grampus.go | 3 +- modules/grampus/resty.go | 2 +- options/locale/locale_en-US.ini | 5 +++ options/locale/locale_zh-CN.ini | 5 +++ routers/api/v1/api.go | 2 +- routers/repo/cloudbrain.go | 3 ++ routers/repo/grampus.go | 38 ++++++++++---------- routers/routes/routes.go | 2 +- templates/repo/cloudbrain/trainjob/new.tmpl | 13 +++++++ templates/repo/grampus/trainjob/gpu/new.tmpl | 13 +++++++ templates/repo/grampus/trainjob/npu/new.tmpl | 13 +++++++ templates/repo/grampus/trainjob/show.tmpl | 13 ++++++- templates/repo/modelarts/trainjob/new.tmpl | 13 +++++++ 14 files changed, 110 insertions(+), 29 deletions(-) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 06fbea5b3f..97fa69e0d7 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -147,6 +147,7 @@ type Cloudbrain struct { ComputeResource string //计算资源,例如npu EngineID int64 //引擎id ImageID string //grampus image_id + AiCenter string //grampus ai center: center_id+center_name TrainUrl string //输出模型的obs路径 BranchName string //分支名称 @@ -1224,11 +1225,14 @@ type GrampusStopJobResponse struct { } type GrampusTasks struct { - Command string `json:"command"` - Name string `json:"name"` - ImageId string `json:"imageId"` - ResourceSpecId string `json:"resourceSpecId"` - ImageUrl string `json:"imageUrl"` + Command string `json:"command"` + Name string `json:"name"` + ImageId string `json:"imageId"` + ResourceSpecId string `json:"resourceSpecId"` + ImageUrl string `json:"imageUrl"` + CenterID []string `json:"centerID"` + CenterName []string `json:"centerName"` + ReplicaNum int `json:"replicaNum"` } type CreateGrampusJobRequest struct { diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index 95985e533b..a0f398115f 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -40,7 +40,7 @@ const ( ProcessorTypeNPU = "npu.huawei.com/NPU" ProcessorTypeGPU = "nvidia.com/gpu" - CommandPrepareScript = "pwd;cd /tmp;mkdir output;mkdir code;mkdir dataset;wget -q https://git.openi.org.cn/lewis/script_for_grampus/archive/master.zip;unzip -q master.zip;cd script_for_grampus;chmod 777 sync_for_arm uploader_for_grampus;" + CommandPrepareScript = "cd /tmp;mkdir -p output;mkdir -p code;mkdir -p dataset;wget -q https://git.openi.org.cn/lewis/script_for_grampus/archive/master.zip;unzip -q master.zip;cd script_for_grampus;chmod 777 sync_for_arm uploader_for_grampus;" CodeArchiveName = "master.zip" ) @@ -92,6 +92,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error ResourceSpecId: req.ResourceSpecId, ImageId: req.ImageId, ImageUrl: req.ImageUrl, + ReplicaNum: 0, }, }, }) diff --git a/modules/grampus/resty.go b/modules/grampus/resty.go index 183afb853b..bd64ace8f4 100755 --- a/modules/grampus/resty.go +++ b/modules/grampus/resty.go @@ -228,7 +228,7 @@ func GetTrainJobLog(jobID string) (string, error) { res, err := client.R(). SetAuthToken(TOKEN). SetResult(&logContent). - Get(HOST + urlTrainJob + "/" + jobID + "/log") + Get(HOST + urlTrainJob + "/" + jobID + "/task/0/replica/0/log") if err != nil { return logContent, fmt.Errorf("resty GetTrainJobLog: %v", err) diff --git a/options/locale/locale_en-US.ini b/options/locale/locale_en-US.ini index c52a369ce4..3166dafb7d 100755 --- a/options/locale/locale_en-US.ini +++ b/options/locale/locale_en-US.ini @@ -1170,6 +1170,8 @@ model.manage.sava_model = Sava Model model.manage.model_manage = ModelManage model.manage.model_accuracy = Model Accuracy +grampus.train_job.ai_center = AI Center + template.items = Template Items template.git_content = Git Content (Default Branch) template.git_hooks = Git Hooks @@ -3013,6 +3015,9 @@ Platform_Tutorial = Tutorial foot.advice_feedback = Feedback [cloudbrain] +resource_cluster = Resource Cluster +resource_cluster_openi = OpenI Resource Cluster +resource_cluster_c2net = China Computing NET compute_resource = Computing resources task_name = Task name task_type = Task type diff --git a/options/locale/locale_zh-CN.ini b/options/locale/locale_zh-CN.ini index cb1c7565a7..e9b6a52803 100755 --- a/options/locale/locale_zh-CN.ini +++ b/options/locale/locale_zh-CN.ini @@ -1180,6 +1180,8 @@ model.manage.sava_model = 保存模型 model.manage.model_manage = 模型管理 model.manage.model_accuracy = 模型精度 +grampus.train_job.ai_center=ai计算中心 + template.items=模板选项 template.git_content=Git数据(默认分支) template.git_hooks=Git 钩子 @@ -3023,6 +3025,9 @@ Platform_Tutorial=新手指引 foot.advice_feedback = 意见反馈 [cloudbrain] +resource_cluster = 算力集群 +resource_cluster_openi = 启智集群 +resource_cluster_c2net = 智算集群 compute_resource = 计算资源 task_name = 任务名称 task_type = 任务类型 diff --git a/routers/api/v1/api.go b/routers/api/v1/api.go index 471f8be7ef..f6153e811c 100755 --- a/routers/api/v1/api.go +++ b/routers/api/v1/api.go @@ -935,9 +935,9 @@ func RegisterRoutes(m *macaron.Macaron) { }) }, reqRepoReader(models.UnitTypeCloudBrain)) m.Group("/grampus", func() { - m.Get("/:id", repo.GetCloudbrainTask) m.Group("/train-job", func() { m.Group("/:jobid", func() { + m.Get("", repo.GetModelArtsTrainJobVersion) m.Post("/stop_version", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo_ext.GrampusStopJob) m.Get("/log", repo_ext.GrampusGetLog) }) diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 69cd249011..525dd07bb3 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -1477,6 +1477,9 @@ func SyncCloudbrainStatus() { } if result != nil { + if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 { + task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0] + } task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status) task.Duration = result.JobInfo.RunSec task.TrainJobDuration = models.ConvertDurationToStr(task.Duration) diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index 542b1c3865..d25a75c688 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -9,7 +9,6 @@ import ( "code.gitea.io/gitea/modules/util" "encoding/json" "errors" - "fmt" "github.com/unknwon/com" "io/ioutil" "net/http" @@ -458,11 +457,14 @@ func GrampusTrainJobShow(ctx *context.Context) { result, err := grampus.GetJob(task.JobID) if err != nil { log.Error("GetJob failed:" + err.Error()) - ctx.ServerError("GetJob failed", err) - return + //ctx.ServerError("GetJob failed", err) + //return } if result != nil { + if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 { + task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0] + } task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status) if task.Status != result.JobInfo.Status || result.JobInfo.Status == models.GrampusStatusRunning { task.Duration = result.JobInfo.RunSec @@ -508,6 +510,13 @@ func GrampusTrainJobShow(ctx *context.Context) { taskList = append(taskList, task) ctx.Data["version_list_task"] = taskList + ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task) + + aiCenterInfo := strings.Split(task.AiCenter, "+") + if len(aiCenterInfo) == 2 { + ctx.Data["ai_center"] = aiCenterInfo[1] + } + ctx.HTML(http.StatusOK, tplGrampusTrainJobShow) } @@ -553,7 +562,6 @@ func generateCommand(repoName, processorType, codeObsPath, dataObsPath, bootFile toolUnzip = "tar -zxvf " } commandUnzip := "cd /tmp/dataset;" + toolUnzip + datasetName + ";cd /tmp/code;unzip -q master.zip;" - commandUnzip += "cd /tmp/dataset/" + strings.TrimSuffix(datasetName, ".zip") + ";ls;" command += commandUnzip //exec code @@ -579,25 +587,17 @@ func generateCommand(repoName, processorType, codeObsPath, dataObsPath, bootFile commandCode := "cd /tmp/code/" + repoName + ";python " + bootFile + paramCode + ";" command += commandCode + //get exec result + commandGetRes := "result=$?;" + command += commandGetRes + //upload models commandUpload := "cd /tmp/script_for_grampus/;./uploader_for_grampus " + setting.Bucket + " " + outputObsPath + " " + "/tmp/output/;" command += commandUpload - return command, nil -} - -func generateCommandObsDownloadFile(srcObsFile, dstLocalDir string) (string, error) { - var command string - - command = "python;" - command += "from modelarts.session import Session \n" - command += fmt.Sprintf("session = Session(access_key='%s',secret_key='%s', project_id='%s', region_name='%s') \n", setting.AccessKeyID, setting.SecretAccessKey, setting.ProjectID, setting.Location) - - if util.IsDir(srcObsFile) { - command += fmt.Sprintf("session.obs.download_dir(src_obs_dir=\"%s\", dst_local_dir=\"%s\") \n", srcObsFile, dstLocalDir) - } else { - command += fmt.Sprintf("session.obs.download_file(src_obs_file=\"%s\", dst_local_dir=\"%s\") \n", srcObsFile, dstLocalDir) - } + //check exec result + commandCheckRes := " [[ result -eq 0 ]] && echo success || ls failed;" + command += commandCheckRes return command, nil } diff --git a/routers/routes/routes.go b/routers/routes/routes.go index 64566a1d7c..ab9f12205a 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -1089,7 +1089,7 @@ func RegisterRoutes(m *macaron.Macaron) { m.Get("", reqRepoCloudBrainReader, repo.GrampusTrainJobShow) m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.GrampusStopJob) m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.GrampusTrainJobDel) - m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel) + m.Get("/model_download", cloudbrain.AdminOrJobCreaterRightForTrain, repo.ModelDownload) }) m.Group("/gpu", func() { m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusTrainJobGPUNew) diff --git a/templates/repo/cloudbrain/trainjob/new.tmpl b/templates/repo/cloudbrain/trainjob/new.tmpl index 39315cfad5..4eff7c21c5 100755 --- a/templates/repo/cloudbrain/trainjob/new.tmpl +++ b/templates/repo/cloudbrain/trainjob/new.tmpl @@ -82,6 +82,19 @@

{{.i18n.Tr "repo.modelarts.train_job.basic_info"}}:

+