From 00e8965d3864f9933b34ae73a2647ecbb0f91e25 Mon Sep 17 00:00:00 2001
From: lewis <747342561@qq.com>
Date: Thu, 19 May 2022 15:52:58 +0800
Subject: [PATCH 01/56] init
---
routers/repo/grampus.go | 22 ++++++++++++++++++++++
routers/routes/routes.go | 12 ++++++++++++
2 files changed, 34 insertions(+)
create mode 100755 routers/repo/grampus.go
diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go
new file mode 100755
index 0000000000..3e8329dfbb
--- /dev/null
+++ b/routers/repo/grampus.go
@@ -0,0 +1,22 @@
+package repo
+
+import (
+ "code.gitea.io/gitea/modules/base"
+ "net/http"
+
+ "code.gitea.io/gitea/modules/context"
+)
+
+const (
+ // Template names for the Grampus train-job pages.
+ // tplGrampusTrainJobNew is the one rendered by GrampusNew.
+ tplGrampusTrainJobNew base.TplName = "repo/grampus/trainjob/new"
+ tplGrampusTrainJobShow base.TplName = "repo/grampus/trainjob/show"
+)
+
+// GrampusNew handles the "new train job" page: it fills the context through
+// cloudBrainNewDataPrepare and renders tplGrampusTrainJobNew. A preparation
+// failure is reported through ctx.ServerError and nothing is rendered.
+func GrampusNew(ctx *context.Context) {
+	if err := cloudBrainNewDataPrepare(ctx); err != nil {
+		ctx.ServerError("get new train-job info failed", err)
+		return
+	}
+
+	ctx.HTML(http.StatusOK, tplGrampusTrainJobNew)
+}
diff --git a/routers/routes/routes.go b/routers/routes/routes.go
index 4c3f5f472c..a64eb0fae8 100755
--- a/routers/routes/routes.go
+++ b/routers/routes/routes.go
@@ -1083,6 +1083,18 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.CloudBrainCreate)
})
}, context.RepoRef())
+ m.Group("/grampus", func() {
+ m.Group("/train-job", func() {
+ m.Group("/:jobid", func() {
+ m.Get("", reqRepoCloudBrainReader, repo.CloudBrainTrainJobShow)
+ m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainStop)
+ m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainTrainJobDel)
+ m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel)
+ })
+ m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusNew)
+ m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.GrampusCreate)
+ })
+ }, context.RepoRef())
m.Group("/modelmanage", func() {
m.Post("/create_model", reqRepoModelManageWriter, repo.SaveModel)
m.Post("/create_new_model", repo.SaveNewNameModel)
--
2.34.1
From 126e07c14eb857e2bc1f3c6f3abedea0b0423a4e Mon Sep 17 00:00:00 2001
From: lewis <747342561@qq.com>
Date: Fri, 20 May 2022 17:50:27 +0800
Subject: [PATCH 02/56] add config
---
custom/conf/app.ini.sample | 6 +
modules/grampus/grampus.go | 294 +++++
modules/grampus/resty.go | 1112 +++++++++++++++++
modules/setting/setting.go | 16 +
routers/repo/grampus.go | 425 ++++++-
routers/routes/routes.go | 26 +-
templates/repo/grampus/trainjob/gpu/new.tmpl | 447 +++++++
templates/repo/grampus/trainjob/gpu/show.tmpl | 731 +++++++++++
templates/repo/grampus/trainjob/npu/new.tmpl | 475 +++++++
templates/repo/grampus/trainjob/npu/show.tmpl | 1008 +++++++++++++++
10 files changed, 4527 insertions(+), 13 deletions(-)
create mode 100755 modules/grampus/grampus.go
create mode 100755 modules/grampus/resty.go
create mode 100755 templates/repo/grampus/trainjob/gpu/new.tmpl
create mode 100755 templates/repo/grampus/trainjob/gpu/show.tmpl
create mode 100755 templates/repo/grampus/trainjob/npu/new.tmpl
create mode 100755 templates/repo/grampus/trainjob/npu/show.tmpl
diff --git a/custom/conf/app.ini.sample b/custom/conf/app.ini.sample
index d294c88235..7a4298f6bd 100755
--- a/custom/conf/app.ini.sample
+++ b/custom/conf/app.ini.sample
@@ -1141,3 +1141,9 @@ growth_issue=0.2
growth_contributors=0.2
growth_commit=0.2
growth_comments=0.2
+
+
+[grampus]
+USERNAME =
+PASSWORD =
+SERVER_HOST =
diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go
new file mode 100755
index 0000000000..b60afb5cce
--- /dev/null
+++ b/modules/grampus/grampus.go
@@ -0,0 +1,294 @@
+package grampus
+
+import (
+ "code.gitea.io/gitea/modules/timeutil"
+ "strconv"
+
+ "code.gitea.io/gitea/models"
+ "code.gitea.io/gitea/modules/context"
+ "code.gitea.io/gitea/modules/log"
+ "code.gitea.io/gitea/modules/notification"
+)
+
+// Package-level constants for the grampus module.
+const (
+ // notebook
+ storageTypeOBS = "obs"
+ // 4 hours, presumably in seconds (4 * 60 * 60).
+ autoStopDuration = 4 * 60 * 60
+ // The same span in milliseconds; sent as the "duration" query parameter by ManageNotebook2.
+ autoStopDurationMs = 4 * 60 * 60 * 1000
+
+ DataSetMountPath = "/home/ma-user/work"
+ NotebookEnv = "Python3"
+ NotebookType = "Ascend"
+ FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"
+
+ // train-job
+ // The commented-out literals below are sample JSON payloads kept for reference.
+ // ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}"
+ // Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
+ // EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
+ // "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
+ // "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
+ // "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
+ // "]}"
+ // TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
+ // "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
+ // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
+ // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
+ // "]}"
+ // Path fragments; presumably sub-directories of a job's storage area — TODO confirm against callers.
+ CodePath = "/code/"
+ OutputPath = "/output/"
+ ResultPath = "/result/"
+ LogPath = "/log/"
+ JobPath = "/job/"
+ OrderDesc = "desc" // query downward
+ OrderAsc = "asc" // query upward
+ Lines = 500
+ // Names of run parameters; presumably passed to the training script — TODO confirm.
+ TrainUrl = "train_url"
+ DataUrl = "data_url"
+ ResultUrl = "result_url"
+ CkptUrl = "ckpt_url"
+ DeviceTarget = "device_target"
+ Ascend = "Ascend"
+ PerPage = 10
+ // String flags with the same names as the IsLatestVersion request/record field.
+ IsLatestVersion = "1"
+ NotLatestVersion = "0"
+ VersionCount = 1
+
+ SortByCreateTime = "create_time"
+ ConfigTypeCustom = "custom"
+ TotalVersionCount = 1
+)
+
+// Package-level caches of resource metadata.
+// NOTE(review): nothing in the visible part of this patch assigns or reads
+// these variables; confirm where they are populated before relying on them.
+var (
+ poolInfos *models.PoolInfos
+ FlavorInfos *models.FlavorInfos
+ ImageInfos *models.ImageInfosModelArts
+)
+
+// GenerateTrainJobReq carries everything GenerateTrainJob needs to create a
+// remote train job and to store the matching models.Cloudbrain record.
+//
+// GenerateTrainJob sends JobName, Description, WorkServerNumber, CodeObsPath,
+// BootFileUrl, DataUrl, EngineID, TrainUrl, LogUrl, PoolID, FlavorCode and
+// Parameters to the remote service; the remaining fields it reads are only
+// written to the local record. PreVersionId and PreVersionName are not read
+// by GenerateTrainJob.
+type GenerateTrainJobReq struct {
+ JobName string
+ DisplayJobName string
+ // Uuid identifies the dataset attachment (looked up with models.GetAttachmentByUUID).
+ Uuid string
+ Description string
+ // CodeObsPath is sent to the remote service as Config.AppUrl.
+ CodeObsPath string
+ BootFile string
+ BootFileUrl string
+ DataUrl string
+ TrainUrl string
+ FlavorCode string
+ LogUrl string
+ PoolID string
+ WorkServerNumber int
+ EngineID int64
+ // Parameters is the structured form sent to the remote service.
+ Parameters []models.Parameter
+ CommitID string
+ IsLatestVersion string
+ // Params is stored verbatim in the Cloudbrain record's Parameters column.
+ Params string
+ BranchName string
+ PreVersionId int64
+ PreVersionName string
+ FlavorName string
+ VersionCount int
+ EngineName string
+ TotalVersionCount int
+}
+
+// GenerateInferenceJobReq is the inference-job counterpart of
+// GenerateTrainJobReq: it shares most fields and adds the label, model and
+// checkpoint selectors plus ResultUrl.
+// NOTE(review): no function in the visible part of this patch consumes this
+// type, so the meaning of individual fields cannot be confirmed from here.
+type GenerateInferenceJobReq struct {
+ JobName string
+ DisplayJobName string
+ Uuid string
+ Description string
+ CodeObsPath string
+ BootFile string
+ BootFileUrl string
+ DataUrl string
+ TrainUrl string
+ FlavorCode string
+ LogUrl string
+ PoolID string
+ WorkServerNumber int
+ EngineID int64
+ Parameters []models.Parameter
+ CommitID string
+ Params string
+ BranchName string
+ FlavorName string
+ EngineName string
+ LabelName string
+ IsLatestVersion string
+ VersionCount int
+ TotalVersionCount int
+ ModelName string
+ ModelVersion string
+ CkptName string
+ ResultUrl string
+}
+
+// VersionInfo mirrors a JSON document of the form
+// {"version":[{"id":<int>,"value":<string>}, ...]}.
+type VersionInfo struct {
+ Version []struct {
+ ID int `json:"id"`
+ Value string `json:"value"`
+ } `json:"version"`
+}
+
+// Flavor mirrors a JSON document of the form
+// {"flavor":[{"code":<string>,"value":<string>}, ...]}.
+type Flavor struct {
+ Info []struct {
+ Code string `json:"code"`
+ Value string `json:"value"`
+ } `json:"flavor"`
+}
+
+// Engine mirrors a JSON document of the form
+// {"engine":[{"id":<int>,"value":<string>}, ...]}.
+type Engine struct {
+ Info []struct {
+ ID int `json:"id"`
+ Value string `json:"value"`
+ } `json:"engine"`
+}
+
+// ResourcePool mirrors a JSON document of the form
+// {"resource_pool":[{"id":<string>,"value":<string>}, ...]}.
+type ResourcePool struct {
+ Info []struct {
+ ID string `json:"id"`
+ Value string `json:"value"`
+ } `json:"resource_pool"`
+}
+
+// type Parameter struct {
+// Label string `json:"label"`
+// Value string `json:"value"`
+// }
+
+// type Parameters struct {
+// Parameter []Parameter `json:"parameter"`
+// }
+
+// Parameters mirrors a JSON document of the form
+// {"parameter":[{"label":<string>,"value":<string>}, ...]}.
+type Parameters struct {
+ Parameter []struct {
+ Label string `json:"label"`
+ Value string `json:"value"`
+ } `json:"parameter"`
+}
+
+// GenerateTrainJob creates a train job on the remote service and records it
+// locally as a models.Cloudbrain row.
+//
+// Steps, in order:
+//  1. resolve the dataset attachment named by req.Uuid (done first so that a
+//     bad UUID fails before any remote job exists);
+//  2. submit the job through createTrainJob;
+//  3. store the Cloudbrain record using the remote job/version identifiers;
+//  4. emit a models.ActionCreateTrainTask notification.
+//
+// A failing step is logged and its error returned unchanged.
+// NOTE(review): when step 3 fails the remote job already exists and is not
+// rolled back here.
+func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
+	createTime := timeutil.TimeStampNow()
+
+	attach, err := models.GetAttachmentByUUID(req.Uuid)
+	if err != nil {
+		// Log the UUID that was actually looked up.
+		log.Error("GetAttachmentByUUID(%s) failed:%v", req.Uuid, err.Error())
+		return err
+	}
+
+	jobResult, err := createTrainJob(models.CreateTrainJobParams{
+		JobName:     req.JobName,
+		Description: req.Description,
+		Config: models.Config{
+			WorkServerNum: req.WorkServerNumber,
+			AppUrl:        req.CodeObsPath,
+			BootFileUrl:   req.BootFileUrl,
+			DataUrl:       req.DataUrl,
+			EngineID:      req.EngineID,
+			TrainUrl:      req.TrainUrl,
+			LogUrl:        req.LogUrl,
+			PoolID:        req.PoolID,
+			CreateVersion: true,
+			Flavor: models.Flavor{
+				Code: req.FlavorCode,
+			},
+			Parameter: req.Parameters,
+		},
+	})
+	if err != nil {
+		log.Error("CreateJob failed: %v", err.Error())
+		return err
+	}
+
+	// The record stores the remote job ID in its decimal string form.
+	jobID := strconv.FormatInt(jobResult.JobID, 10)
+	err = models.CreateCloudbrain(&models.Cloudbrain{
+		Status:            TransTrainJobStatus(jobResult.Status),
+		UserID:            ctx.User.ID,
+		RepoID:            ctx.Repo.Repository.ID,
+		JobID:             jobID,
+		JobName:           req.JobName,
+		DisplayJobName:    req.DisplayJobName,
+		JobType:           string(models.JobTypeTrain),
+		Type:              models.TypeCloudBrainTwo,
+		VersionID:         jobResult.VersionID,
+		VersionName:       jobResult.VersionName,
+		Uuid:              req.Uuid,
+		DatasetName:       attach.Name,
+		CommitID:          req.CommitID,
+		IsLatestVersion:   req.IsLatestVersion,
+		ComputeResource:   models.NPUResource,
+		EngineID:          req.EngineID,
+		TrainUrl:          req.TrainUrl,
+		BranchName:        req.BranchName,
+		Parameters:        req.Params,
+		BootFile:          req.BootFile,
+		DataUrl:           req.DataUrl,
+		LogUrl:            req.LogUrl,
+		FlavorCode:        req.FlavorCode,
+		Description:       req.Description,
+		WorkServerNumber:  req.WorkServerNumber,
+		FlavorName:        req.FlavorName,
+		EngineName:        req.EngineName,
+		VersionCount:      req.VersionCount,
+		TotalVersionCount: req.TotalVersionCount,
+		CreatedUnix:       createTime,
+		UpdatedUnix:       createTime,
+	})
+	if err != nil {
+		log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error())
+		return err
+	}
+
+	notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateTrainTask)
+	return nil
+}
+
+// TransTrainJobStatus converts the numeric job status reported by the remote
+// service into its symbolic name. Codes 0 through 21 have names; any other
+// value is returned as its decimal string.
+func TransTrainJobStatus(status int) string {
+	// Indexed by status code.
+	names := [...]string{
+		"UNKNOWN",
+		"INIT",
+		"IMAGE_CREATING",
+		"IMAGE_FAILED",
+		"SUBMIT_TRYING",
+		"SUBMIT_FAILED",
+		"DELETE_FAILED",
+		"WAITING",
+		"RUNNING",
+		"KILLING",
+		"COMPLETED",
+		"FAILED",
+		"KILLED",
+		"CANCELED",
+		"LOST",
+		"SCALING",
+		"SUBMIT_MODEL_FAILED",
+		"DEPLOY_SERVICE_FAILED",
+		"CHECK_INIT",
+		"CHECK_RUNNING",
+		"CHECK_RUNNING_COMPLETED",
+		"CHECK_FAILED",
+	}
+
+	if status < 0 || status >= len(names) {
+		return strconv.Itoa(status)
+	}
+	return names[status]
+}
diff --git a/modules/grampus/resty.go b/modules/grampus/resty.go
new file mode 100755
index 0000000000..693ba71a1d
--- /dev/null
+++ b/modules/grampus/resty.go
@@ -0,0 +1,1112 @@
+package grampus
+
+import (
+ "crypto/tls"
+ "encoding/json"
+ "fmt"
+ "net/http"
+ "strconv"
+
+ "code.gitea.io/gitea/models"
+ "code.gitea.io/gitea/modules/log"
+ "code.gitea.io/gitea/modules/setting"
+ "github.com/go-resty/resty/v2"
+)
+
+// Shared state of the HTTP client used by every request helper in this file.
+// None of it is guarded by a lock in the visible code.
+var (
+ // restyClient is created lazily by getRestyClient.
+ restyClient *resty.Client
+ // HOST is copied from setting.Grampus.Host each time getToken runs.
+ HOST string
+ // TOKEN is the bearer token stored by getToken and sent via SetAuthToken.
+ TOKEN string
+)
+
+// URL fragments and error codes used by the request helpers in this file.
+const (
+ urlOpenApiV1 = "/openapi/v1/"
+
+ // urlGetToken is appended to HOST by getToken.
+ urlGetToken = urlOpenApiV1 + "token"
+ // The fragments below are appended to HOST + "/v1/" + setting.ProjectID.
+ urlNotebook = "/demanager/instances"
+ urlTrainJob = "/training-jobs"
+ urlResourceSpecs = "/job/resource-specs"
+ urlTrainJobConfig = "/training-job-configs"
+ // errorCodeExceedLimit is the error code CreateJob maps to a quota-exceeded message.
+ errorCodeExceedLimit = "ModelArts.0118"
+
+ // NOTE(review): empty, so the *Notebook2 helpers build URLs with a "//" before the job ID.
+ urlNotebook2 = ""
+
+ // NOTE(review): empty, so comparisons against a non-empty ErrorCode never match.
+ modelartsIllegalToken = ""
+)
+
+// GetTokenParams is the JSON request body getToken posts to obtain a token.
+type GetTokenParams struct {
+ UserName string `json:"user_name"`
+ Password string `json:"password"`
+}
+
+// GetTokenResult is the JSON response decoded by getToken. Only Token is read
+// by the visible code; Expiration is decoded but not used.
+type GetTokenResult struct {
+ Token string `json:"token"`
+ Expiration int64 `json:"expiration"`
+}
+
+// getRestyClient returns the package-wide resty client, creating it on first
+// use. The lazy creation is not synchronized.
+//
+// NOTE(review): the client is configured with InsecureSkipVerify, i.e. the
+// server's TLS certificate is not verified. Confirm this is intended outside
+// of test environments.
+func getRestyClient() *resty.Client {
+	if restyClient != nil {
+		return restyClient
+	}
+
+	restyClient = resty.New()
+	restyClient.SetTLSClientConfig(&tls.Config{InsecureSkipVerify: true})
+	return restyClient
+}
+
+// checkSetting makes sure HOST, TOKEN and the resty client are initialised.
+// If any of them is missing it calls getToken; a failure there is only
+// logged, so callers proceed with whatever state is available.
+func checkSetting() {
+	if HOST == "" || TOKEN == "" || restyClient == nil {
+		if err := getToken(); err != nil {
+			log.Error("getToken failed:%v", err)
+		}
+	}
+}
+
+// getToken logs in to the Grampus service and caches the resulting token.
+//
+// It sets HOST from setting.Grampus.Host, posts the configured user name and
+// password to HOST + urlGetToken and, on HTTP 200, stores the returned token
+// in TOKEN. A transport failure or any other status code is returned as an
+// error and leaves TOKEN untouched.
+func getToken() error {
+	HOST = setting.Grampus.Host
+
+	client := getRestyClient()
+	params := GetTokenParams{
+		UserName: setting.Grampus.UserName,
+		Password: setting.Grampus.Password,
+	}
+
+	var result GetTokenResult
+	res, err := client.R().
+		SetHeader("Content-Type", "application/json").
+		SetBody(params).
+		SetResult(&result).
+		Post(HOST + urlGetToken)
+	if err != nil {
+		return fmt.Errorf("resty getToken: %v", err)
+	}
+
+	if res.StatusCode() != http.StatusOK {
+		return fmt.Errorf("getToken failed:%s", res.String())
+	}
+
+	// The token is a bearer credential; it must never be written to the log.
+	TOKEN = result.Token
+
+	return nil
+}
+
+// CreateJob asks the remote service to create a notebook instance.
+//
+// It POSTs createJobParams to HOST + "/v1/" + setting.ProjectID + urlNotebook
+// with the cached TOKEN. On a 401 response the token is refreshed once and
+// the request is sent again. The response body is then decoded as
+// models.NotebookResult: a non-empty ErrorCode becomes an error, and
+// errorCodeExceedLimit is replaced by a user-facing quota message.
+//
+// A transport failure returns (nil, err); decode and service errors return
+// the result as filled so far together with the error.
+func CreateJob(createJobParams models.CreateNotebookParams) (*models.CreateNotebookResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.CreateNotebookResult
+
+	retry := 0
+
+sendjob:
+	res, err := client.R().
+		SetHeader("Content-Type", "application/json").
+		SetAuthToken(TOKEN).
+		SetBody(createJobParams).
+		SetResult(&result).
+		Post(HOST + "/v1/" + setting.ProjectID + urlNotebook)
+
+	if err != nil {
+		return nil, fmt.Errorf("resty create notebook: %s", err)
+	}
+
+	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
+		retry++
+		// Best effort: retry even if the refresh fails, but do not hide the failure.
+		if tokenErr := getToken(); tokenErr != nil {
+			log.Error("getToken failed:%v", tokenErr)
+		}
+		goto sendjob
+	}
+
+	var response models.NotebookResult
+	err = json.Unmarshal(res.Body(), &response)
+	if err != nil {
+		log.Error("json.Unmarshal failed: %s", err.Error())
+		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
+	}
+
+	if len(response.ErrorCode) != 0 {
+		log.Error("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+		if response.ErrorCode == errorCodeExceedLimit {
+			response.ErrorMsg = "所选规格使用数量已超过最大配额限制。"
+		}
+		return &result, fmt.Errorf("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+	}
+
+	return &result, nil
+}
+
+// GetJob fetches the notebook instance identified by jobID.
+//
+// It GETs HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID with
+// the cached TOKEN, refreshing the token and retrying once on a 401. The body
+// is decoded as models.NotebookResult and a non-empty ErrorCode is returned
+// as an error.
+//
+// A transport failure returns (nil, err); decode and service errors return
+// the result as filled so far together with the error.
+func GetJob(jobID string) (*models.GetNotebookResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.GetNotebookResult
+
+	retry := 0
+
+sendjob:
+	res, err := client.R().
+		SetHeader("Content-Type", "application/json").
+		SetAuthToken(TOKEN).
+		SetResult(&result).
+		Get(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID)
+
+	if err != nil {
+		return nil, fmt.Errorf("resty GetJob: %v", err)
+	}
+
+	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
+		retry++
+		// Best effort: retry even if the refresh fails, but do not hide the failure.
+		if tokenErr := getToken(); tokenErr != nil {
+			log.Error("getToken failed:%v", tokenErr)
+		}
+		goto sendjob
+	}
+
+	var response models.NotebookResult
+	err = json.Unmarshal(res.Body(), &response)
+	if err != nil {
+		log.Error("json.Unmarshal failed: %s", err.Error())
+		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
+	}
+
+	if len(response.ErrorCode) != 0 {
+		log.Error("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+		return &result, fmt.Errorf("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+	}
+
+	return &result, nil
+}
+
+// GetNotebook2 fetches the notebook identified by jobID through the
+// urlNotebook2 endpoint.
+//
+// It GETs HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID with
+// the cached TOKEN, refreshing the token and retrying once on a 401. The body
+// is decoded as models.NotebookResult and a non-empty ErrorCode is returned
+// as an error.
+//
+// A transport failure returns (nil, err); decode and service errors return
+// the result as filled so far together with the error.
+func GetNotebook2(jobID string) (*models.GetNotebook2Result, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.GetNotebook2Result
+
+	retry := 0
+
+sendjob:
+	res, err := client.R().
+		SetHeader("Content-Type", "application/json").
+		SetAuthToken(TOKEN).
+		SetResult(&result).
+		Get(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID)
+
+	if err != nil {
+		return nil, fmt.Errorf("resty GetJob: %v", err)
+	}
+
+	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
+		retry++
+		// Best effort: retry even if the refresh fails, but do not hide the failure.
+		if tokenErr := getToken(); tokenErr != nil {
+			log.Error("getToken failed:%v", tokenErr)
+		}
+		goto sendjob
+	}
+
+	var response models.NotebookResult
+	err = json.Unmarshal(res.Body(), &response)
+	if err != nil {
+		log.Error("json.Unmarshal failed: %s", err.Error())
+		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
+	}
+
+	if len(response.ErrorCode) != 0 {
+		log.Error("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+		// NOTE(review): modelartsIllegalToken is currently "", so inside this
+		// non-empty-ErrorCode branch the comparison cannot match.
+		if response.ErrorCode == modelartsIllegalToken && retry < 1 {
+			retry++
+			if tokenErr := getToken(); tokenErr != nil {
+				log.Error("getToken failed:%v", tokenErr)
+			}
+			goto sendjob
+		}
+		return &result, fmt.Errorf("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+	}
+
+	return &result, nil
+}
+
+// ManageNotebook posts an action for the notebook identified by jobID.
+//
+// It POSTs param to HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" +
+// jobID + "/action" with the cached TOKEN, refreshing the token and retrying
+// once on a 401. The body is decoded as models.NotebookResult and a non-empty
+// ErrorCode is returned as an error.
+//
+// Every error path returns the result as filled so far together with the error.
+func ManageNotebook(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.NotebookActionResult
+
+	retry := 0
+
+sendjob:
+	res, err := client.R().
+		SetHeader("Content-Type", "application/json").
+		SetBody(param).
+		SetAuthToken(TOKEN).
+		SetResult(&result).
+		Post(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID + "/action")
+
+	if err != nil {
+		return &result, fmt.Errorf("resty StopJob: %v", err)
+	}
+
+	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
+		retry++
+		// Best effort: retry even if the refresh fails, but do not hide the failure.
+		if tokenErr := getToken(); tokenErr != nil {
+			log.Error("getToken failed:%v", tokenErr)
+		}
+		goto sendjob
+	}
+
+	var response models.NotebookResult
+	err = json.Unmarshal(res.Body(), &response)
+	if err != nil {
+		log.Error("json.Unmarshal failed: %s", err.Error())
+		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
+	}
+
+	if len(response.ErrorCode) != 0 {
+		log.Error("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+		return &result, fmt.Errorf("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+	}
+
+	return &result, nil
+}
+
+// ManageNotebook2 triggers param.Action on the notebook identified by jobID
+// through the urlNotebook2 endpoint.
+//
+// The action name is part of the URL path and autoStopDurationMs is sent as
+// the "duration" query parameter; no request body is set. The token is
+// refreshed and the request retried once on a 401. The body is decoded as
+// models.NotebookResult and a non-empty ErrorCode is returned as an error.
+//
+// Every error path returns the result as filled so far together with the error.
+func ManageNotebook2(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.NotebookActionResult
+
+	retry := 0
+
+sendjob:
+	res, err := client.R().
+		SetHeader("Content-Type", "application/json").
+		SetAuthToken(TOKEN).
+		SetResult(&result).
+		Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID + "/" + param.Action + "?duration=" + strconv.Itoa(autoStopDurationMs))
+
+	if err != nil {
+		return &result, fmt.Errorf("resty ManageNotebook2: %v", err)
+	}
+
+	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
+		retry++
+		// Best effort: retry even if the refresh fails, but do not hide the failure.
+		if tokenErr := getToken(); tokenErr != nil {
+			log.Error("getToken failed:%v", tokenErr)
+		}
+		goto sendjob
+	}
+
+	var response models.NotebookResult
+	err = json.Unmarshal(res.Body(), &response)
+	if err != nil {
+		log.Error("json.Unmarshal failed: %s", err.Error())
+		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
+	}
+
+	if len(response.ErrorCode) != 0 {
+		log.Error("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+		// NOTE(review): modelartsIllegalToken is currently "", so inside this
+		// non-empty-ErrorCode branch the comparison cannot match.
+		if response.ErrorCode == modelartsIllegalToken && retry < 1 {
+			retry++
+			if tokenErr := getToken(); tokenErr != nil {
+				log.Error("getToken failed:%v", tokenErr)
+			}
+			goto sendjob
+		}
+		return &result, fmt.Errorf("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+	}
+
+	return &result, nil
+}
+
+// DelNotebook deletes the notebook instance identified by jobID.
+//
+// It sends DELETE to HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" +
+// jobID with the cached TOKEN, refreshing the token and retrying once on a
+// 401. The body is decoded as models.NotebookResult and a non-empty ErrorCode
+// is returned as an error.
+//
+// Every error path returns the result as filled so far together with the error.
+func DelNotebook(jobID string) (*models.NotebookDelResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.NotebookDelResult
+
+	retry := 0
+
+sendjob:
+	res, err := client.R().
+		SetHeader("Content-Type", "application/json").
+		SetAuthToken(TOKEN).
+		SetResult(&result).
+		Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID)
+
+	if err != nil {
+		return &result, fmt.Errorf("resty DelJob: %v", err)
+	}
+
+	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
+		retry++
+		// Best effort: retry even if the refresh fails, but do not hide the failure.
+		if tokenErr := getToken(); tokenErr != nil {
+			log.Error("getToken failed:%v", tokenErr)
+		}
+		goto sendjob
+	}
+
+	var response models.NotebookResult
+	err = json.Unmarshal(res.Body(), &response)
+	if err != nil {
+		log.Error("json.Unmarshal failed: %s", err.Error())
+		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
+	}
+
+	if len(response.ErrorCode) != 0 {
+		log.Error("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+		return &result, fmt.Errorf("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+	}
+
+	return &result, nil
+}
+
+// DelNotebook2 deletes the notebook identified by jobID through the
+// urlNotebook2 endpoint.
+//
+// It sends DELETE to HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" +
+// jobID with the cached TOKEN, refreshing the token and retrying once on a
+// 401. The body is decoded as models.NotebookResult and a non-empty ErrorCode
+// is returned as an error.
+//
+// Every error path returns the result as filled so far together with the error.
+func DelNotebook2(jobID string) (*models.NotebookDelResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.NotebookDelResult
+
+	retry := 0
+
+sendjob:
+	res, err := client.R().
+		SetHeader("Content-Type", "application/json").
+		SetAuthToken(TOKEN).
+		SetResult(&result).
+		Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID)
+
+	if err != nil {
+		return &result, fmt.Errorf("resty DelJob: %v", err)
+	}
+
+	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
+		retry++
+		// Best effort: retry even if the refresh fails, but do not hide the failure.
+		if tokenErr := getToken(); tokenErr != nil {
+			log.Error("getToken failed:%v", tokenErr)
+		}
+		goto sendjob
+	}
+
+	var response models.NotebookResult
+	err = json.Unmarshal(res.Body(), &response)
+	if err != nil {
+		log.Error("json.Unmarshal failed: %s", err.Error())
+		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
+	}
+
+	if len(response.ErrorCode) != 0 {
+		log.Error("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+		// NOTE(review): modelartsIllegalToken is currently "", so inside this
+		// non-empty-ErrorCode branch the comparison cannot match.
+		if response.ErrorCode == modelartsIllegalToken && retry < 1 {
+			retry++
+			if tokenErr := getToken(); tokenErr != nil {
+				log.Error("getToken failed:%v", tokenErr)
+			}
+			goto sendjob
+		}
+		return &result, fmt.Errorf("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+	}
+
+	return &result, nil
+}
+
+// DelJob deletes the job identified by jobID. It issues the same request as
+// DelNotebook: DELETE HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" +
+// jobID with the cached TOKEN, refreshing the token and retrying once on a
+// 401. The body is decoded as models.NotebookResult and a non-empty ErrorCode
+// is returned as an error.
+//
+// Every error path returns the result as filled so far together with the error.
+func DelJob(jobID string) (*models.NotebookDelResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.NotebookDelResult
+
+	retry := 0
+
+sendjob:
+	res, err := client.R().
+		SetHeader("Content-Type", "application/json").
+		SetAuthToken(TOKEN).
+		SetResult(&result).
+		Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID)
+
+	if err != nil {
+		return &result, fmt.Errorf("resty DelJob: %v", err)
+	}
+
+	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
+		retry++
+		// Best effort: retry even if the refresh fails, but do not hide the failure.
+		if tokenErr := getToken(); tokenErr != nil {
+			log.Error("getToken failed:%v", tokenErr)
+		}
+		goto sendjob
+	}
+
+	var response models.NotebookResult
+	err = json.Unmarshal(res.Body(), &response)
+	if err != nil {
+		log.Error("json.Unmarshal failed: %s", err.Error())
+		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
+	}
+
+	if len(response.ErrorCode) != 0 {
+		log.Error("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+		return &result, fmt.Errorf("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+	}
+
+	return &result, nil
+}
+
+// GetJobToken fetches the access token of the notebook identified by jobID.
+//
+// It GETs HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID +
+// "/token" with the cached TOKEN, refreshing the token and retrying once on a
+// 401. The body is decoded as models.NotebookResult and a non-empty ErrorCode
+// is returned as an error.
+//
+// Every error path returns the result as filled so far together with the error.
+func GetJobToken(jobID string) (*models.NotebookGetJobTokenResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.NotebookGetJobTokenResult
+
+	retry := 0
+
+sendjob:
+	res, err := client.R().
+		SetHeader("Content-Type", "application/json").
+		SetAuthToken(TOKEN).
+		SetResult(&result).
+		Get(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID + "/token")
+
+	if err != nil {
+		return &result, fmt.Errorf("resty GetJobToken: %v", err)
+	}
+
+	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
+		retry++
+		// Best effort: retry even if the refresh fails, but do not hide the failure.
+		if tokenErr := getToken(); tokenErr != nil {
+			log.Error("getToken failed:%v", tokenErr)
+		}
+		goto sendjob
+	}
+
+	var response models.NotebookResult
+	err = json.Unmarshal(res.Body(), &response)
+	if err != nil {
+		log.Error("json.Unmarshal failed: %s", err.Error())
+		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
+	}
+
+	if len(response.ErrorCode) != 0 {
+		log.Error("GetJobToken failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+		return &result, fmt.Errorf("GetJobToken failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+	}
+
+	return &result, nil
+}
+
+// createTrainJob submits a new train job to the remote service.
+//
+// The request is POSTed to HOST + "/v1/" + setting.ProjectID + urlTrainJob
+// with the cached TOKEN; a 401 triggers one token refresh and one retry. The
+// JSON form of createJobParams is logged after every attempt that reaches the
+// server. Non-200 responses are decoded as models.ErrorResult, and the two
+// "Invalid OBS path" messages (boot file, dataset) are mapped to dedicated
+// errors.
+//
+// A transport failure returns (nil, err); all other failures return the
+// result as filled so far together with the error.
+func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.CreateTrainJobResult, error) {
+	checkSetting()
+	httpClient := getRestyClient()
+	var result models.CreateTrainJobResult
+
+	// At most two attempts: the second one only after a 401 on the first.
+	for attempt := 0; ; attempt++ {
+		resp, err := httpClient.R().
+			SetHeader("Content-Type", "application/json").
+			SetAuthToken(TOKEN).
+			SetBody(createJobParams).
+			SetResult(&result).
+			Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob)
+		if err != nil {
+			return nil, fmt.Errorf("resty create train-job: %s", err)
+		}
+
+		payload, _ := json.Marshal(createJobParams)
+		log.Info("%s", payload)
+
+		status := resp.StatusCode()
+		if status == http.StatusUnauthorized && attempt < 1 {
+			_ = getToken()
+			continue
+		}
+
+		if status != http.StatusOK {
+			raw := resp.String()
+			var failure models.ErrorResult
+			if err = json.Unmarshal([]byte(raw), &failure); err != nil {
+				log.Error("json.Unmarshal failed(%s): %v", raw, err.Error())
+				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", raw, err.Error())
+			}
+			log.Error("createTrainJob failed(%d):%s(%s)", status, failure.ErrorCode, failure.ErrorMsg)
+
+			// The boot-file message is checked before the dataset message.
+			switch failure.ErrorMsg {
+			case "Invalid OBS path '" + createJobParams.Config.BootFileUrl + "'.":
+				log.Error("启动文件错误!createTrainJob failed(%d):%s(%s)", status, failure.ErrorCode, failure.ErrorMsg)
+				return &result, fmt.Errorf("启动文件错误!")
+			case "Invalid OBS path '" + createJobParams.Config.DataUrl + "'.":
+				log.Error("数据集错误!createTrainJob failed(%d):%s(%s)", status, failure.ErrorCode, failure.ErrorMsg)
+				return &result, fmt.Errorf("数据集错误!")
+			}
+			return &result, fmt.Errorf("createTrainJob failed(%d):%s(%s)", status, failure.ErrorCode, failure.ErrorMsg)
+		}
+
+		if !result.IsSuccess {
+			log.Error("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg)
+			return &result, fmt.Errorf("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg)
+		}
+
+		return &result, nil
+	}
+}
+
+// createTrainJobVersion submits a new version of the train job jobID.
+//
+// The request is POSTed to HOST + "/v1/" + setting.ProjectID + urlTrainJob +
+// "/" + jobID + "/versions" with the cached TOKEN; a 401 triggers one token
+// refresh and one retry. The JSON form of the parameters is logged after
+// every attempt that reaches the server. Non-200 responses are decoded as
+// models.ErrorResult, and the two "Invalid OBS path" messages (boot file,
+// dataset) are mapped to dedicated errors.
+//
+// A transport failure returns (nil, err); all other failures return the
+// result as filled so far together with the error.
+func createTrainJobVersion(createJobVersionParams models.CreateTrainJobVersionParams, jobID string) (*models.CreateTrainJobResult, error) {
+	checkSetting()
+	httpClient := getRestyClient()
+	var result models.CreateTrainJobResult
+
+	// At most two attempts: the second one only after a 401 on the first.
+	for attempt := 0; ; attempt++ {
+		resp, err := httpClient.R().
+			SetHeader("Content-Type", "application/json").
+			SetAuthToken(TOKEN).
+			SetBody(createJobVersionParams).
+			SetResult(&result).
+			Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions")
+		if err != nil {
+			return nil, fmt.Errorf("resty create train-job version: %s", err)
+		}
+
+		payload, _ := json.Marshal(createJobVersionParams)
+		log.Info("%s", payload)
+
+		status := resp.StatusCode()
+		if status == http.StatusUnauthorized && attempt < 1 {
+			_ = getToken()
+			continue
+		}
+
+		if status != http.StatusOK {
+			raw := resp.String()
+			var failure models.ErrorResult
+			if err = json.Unmarshal([]byte(raw), &failure); err != nil {
+				log.Error("json.Unmarshal failed(%s): %v", raw, err.Error())
+				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", raw, err.Error())
+			}
+
+			// The boot-file message is checked before the dataset message.
+			switch failure.ErrorMsg {
+			case "Invalid OBS path '" + createJobVersionParams.Config.BootFileUrl + "'.":
+				log.Error("启动文件错误!createTrainJobVersion failed(%d):%s(%s)", status, failure.ErrorCode, failure.ErrorMsg)
+				return &result, fmt.Errorf("启动文件错误!")
+			case "Invalid OBS path '" + createJobVersionParams.Config.DataUrl + "'.":
+				log.Error("数据集错误!createTrainJobVersion failed(%d):%s(%s)", status, failure.ErrorCode, failure.ErrorMsg)
+				return &result, fmt.Errorf("数据集错误!")
+			}
+			return &result, fmt.Errorf("createTrainJobVersion failed(%d):%s(%s)", status, failure.ErrorCode, failure.ErrorMsg)
+		}
+
+		if !result.IsSuccess {
+			log.Error("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg)
+			return &result, fmt.Errorf("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg)
+		}
+
+		return &result, nil
+	}
+}
+
+// GetResourceSpecs fetches the resource specifications from
+// HOST + "/v1/" + setting.ProjectID + urlResourceSpecs.
+//
+// The cached TOKEN is used; a 401 triggers one token refresh and one retry.
+// Non-200 responses are decoded as models.ErrorResult and reported as an
+// error, as is a 200 response whose IsSuccess flag is false.
+//
+// A transport failure returns (nil, err); all other failures return the
+// result as filled so far together with the error.
+func GetResourceSpecs() (*models.GetResourceSpecsResult, error) {
+	checkSetting()
+	httpClient := getRestyClient()
+	var result models.GetResourceSpecsResult
+
+	// At most two attempts: the second one only after a 401 on the first.
+	for attempt := 0; ; attempt++ {
+		resp, err := httpClient.R().
+			SetHeader("Content-Type", "application/json").
+			SetAuthToken(TOKEN).
+			SetResult(&result).
+			Get(HOST + "/v1/" + setting.ProjectID + urlResourceSpecs)
+		if err != nil {
+			return nil, fmt.Errorf("resty GetResourceSpecs: %v", err)
+		}
+
+		status := resp.StatusCode()
+		if status == http.StatusUnauthorized && attempt < 1 {
+			_ = getToken()
+			continue
+		}
+
+		if status != http.StatusOK {
+			raw := resp.String()
+			var failure models.ErrorResult
+			if err = json.Unmarshal([]byte(raw), &failure); err != nil {
+				log.Error("json.Unmarshal failed(%s): %v", raw, err.Error())
+				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", raw, err.Error())
+			}
+			log.Error("GetResourceSpecs failed(%d):%s(%s)", status, failure.ErrorCode, failure.ErrorMsg)
+			return &result, fmt.Errorf("GetResourceSpecs failed(%d):%s(%s)", status, failure.ErrorCode, failure.ErrorMsg)
+		}
+
+		if !result.IsSuccess {
+			log.Error("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg)
+			return &result, fmt.Errorf("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg)
+		}
+
+		return &result, nil
+	}
+}
+
+// CreateTrainJobConfig stores a train-job configuration by POSTing req to
+// HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig.
+//
+// The cached TOKEN is used; a 401 triggers one token refresh and one retry.
+// Non-200 responses are decoded as models.ErrorResult and reported as an
+// error, as is a 200 response whose IsSuccess flag is false.
+//
+// A transport failure returns (nil, err); all other failures return the
+// result as filled so far together with the error.
+func CreateTrainJobConfig(req models.CreateConfigParams) (*models.CreateTrainJobConfigResult, error) {
+	checkSetting()
+	httpClient := getRestyClient()
+	var result models.CreateTrainJobConfigResult
+
+	// At most two attempts: the second one only after a 401 on the first.
+	for attempt := 0; ; attempt++ {
+		resp, err := httpClient.R().
+			SetHeader("Content-Type", "application/json").
+			SetAuthToken(TOKEN).
+			SetBody(req).
+			SetResult(&result).
+			Post(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig)
+		if err != nil {
+			return nil, fmt.Errorf("resty CreateTrainJobConfig: %s", err)
+		}
+
+		status := resp.StatusCode()
+		if status == http.StatusUnauthorized && attempt < 1 {
+			_ = getToken()
+			continue
+		}
+
+		if status != http.StatusOK {
+			raw := resp.String()
+			var failure models.ErrorResult
+			if err = json.Unmarshal([]byte(raw), &failure); err != nil {
+				log.Error("json.Unmarshal failed(%s): %v", raw, err.Error())
+				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", raw, err.Error())
+			}
+			log.Error("CreateTrainJobConfig failed(%d):%s(%s)", status, failure.ErrorCode, failure.ErrorMsg)
+			return &result, fmt.Errorf("CreateTrainJobConfig failed(%d):%s(%s)", status, failure.ErrorCode, failure.ErrorMsg)
+		}
+
+		if !result.IsSuccess {
+			log.Error("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg)
+			return &result, fmt.Errorf("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg)
+		}
+
+		return &result, nil
+	}
+}
+
+// GetConfigList lists stored train-job configurations.
+//
+// perPage, page, sortBy, order, searchContent and configType are sent as the
+// query parameters per_page, page, sortBy, order, search_content and
+// config_type of a GET to HOST + "/v1/" + setting.ProjectID +
+// urlTrainJobConfig. The cached TOKEN is used; a 401 triggers one token
+// refresh and one retry. Non-200 responses are decoded as models.ErrorResult
+// and reported as an error, as is a 200 response whose IsSuccess flag is false.
+//
+// A transport failure returns (nil, err); all other failures return the
+// result as filled so far together with the error.
+func GetConfigList(perPage, page int, sortBy, order, searchContent, configType string) (*models.GetConfigListResult, error) {
+	checkSetting()
+	httpClient := getRestyClient()
+	var result models.GetConfigListResult
+
+	query := map[string]string{
+		"per_page":       strconv.Itoa(perPage),
+		"page":           strconv.Itoa(page),
+		"sortBy":         sortBy,
+		"order":          order,
+		"search_content": searchContent,
+		"config_type":    configType,
+	}
+
+	// At most two attempts: the second one only after a 401 on the first.
+	for attempt := 0; ; attempt++ {
+		resp, err := httpClient.R().
+			SetQueryParams(query).
+			SetAuthToken(TOKEN).
+			SetResult(&result).
+			Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig)
+		if err != nil {
+			return nil, fmt.Errorf("resty GetConfigList: %v", err)
+		}
+
+		status := resp.StatusCode()
+		if status == http.StatusUnauthorized && attempt < 1 {
+			_ = getToken()
+			continue
+		}
+
+		if status != http.StatusOK {
+			raw := resp.String()
+			var failure models.ErrorResult
+			if err = json.Unmarshal([]byte(raw), &failure); err != nil {
+				log.Error("json.Unmarshal failed(%s): %v", raw, err.Error())
+				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", raw, err.Error())
+			}
+			log.Error("GetConfigList failed(%d):%s(%s)", status, failure.ErrorCode, failure.ErrorMsg)
+			return &result, fmt.Errorf("获取参数配置列表失败(%d):%s(%s)", status, failure.ErrorCode, failure.ErrorMsg)
+		}
+
+		if !result.IsSuccess {
+			log.Error("GetConfigList failed(%s): %s", result.ErrorCode, result.ErrorMsg)
+			return &result, fmt.Errorf("获取参数配置列表失败(%s): %s", result.ErrorCode, result.ErrorMsg)
+		}
+
+		return &result, nil
+	}
+}
+
+// GetParaConfig fetches one stored train-job configuration by name.
+//
+// It GETs HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig + "/" +
+// configName with configType as the config_type query parameter. The cached
+// TOKEN is used; a 401 triggers one token refresh and one retry. Non-200
+// responses are decoded as models.ErrorResult and reported as an error, as is
+// a 200 response whose IsSuccess flag is false.
+//
+// Unlike its siblings this function returns the result by value; on every
+// error path the result as filled so far is returned with the error.
+func GetParaConfig(configName, configType string) (models.GetConfigResult, error) {
+	checkSetting()
+	httpClient := getRestyClient()
+	var result models.GetConfigResult
+
+	query := map[string]string{
+		"config_type": configType,
+	}
+
+	// At most two attempts: the second one only after a 401 on the first.
+	for attempt := 0; ; attempt++ {
+		resp, err := httpClient.R().
+			SetQueryParams(query).
+			SetAuthToken(TOKEN).
+			SetResult(&result).
+			Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig + "/" + configName)
+		if err != nil {
+			return result, fmt.Errorf("resty GetParaConfig: %v", err)
+		}
+
+		status := resp.StatusCode()
+		if status == http.StatusUnauthorized && attempt < 1 {
+			_ = getToken()
+			continue
+		}
+
+		if status != http.StatusOK {
+			raw := resp.String()
+			var failure models.ErrorResult
+			if err = json.Unmarshal([]byte(raw), &failure); err != nil {
+				log.Error("json.Unmarshal failed(%s): %v", raw, err.Error())
+				return result, fmt.Errorf("json.Unmarshal failed(%s): %v", raw, err.Error())
+			}
+			log.Error("GetParaConfig failed(%d):%s(%s)", status, failure.ErrorCode, failure.ErrorMsg)
+			return result, fmt.Errorf("获取参数配置详情失败(%d):%s(%s)", status, failure.ErrorCode, failure.ErrorMsg)
+		}
+
+		if !result.IsSuccess {
+			log.Error("GetParaConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg)
+			return result, fmt.Errorf("获取参数配置详情失败(%s): %s", result.ErrorCode, result.ErrorMsg)
+		}
+
+		return result, nil
+	}
+}
+
+// GetTrainJob requests the details of one version of a training job.
+//
+// jobID and versionID are placed in the request path. When the service
+// answers 401, getToken is called once and the request is sent again.
+//
+// A transport error yields a nil result. Every other failure returns the
+// result as decoded so far together with a non-nil error.
+func GetTrainJob(jobID, versionID string) (*models.GetTrainJobResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.GetTrainJobResult
+
+	// attempt counts resends; only the first 401 triggers one.
+	for attempt := 0; ; attempt++ {
+		res, err := client.R().
+			SetAuthToken(TOKEN).
+			SetResult(&result).
+			Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID)
+		if err != nil {
+			return nil, fmt.Errorf("resty GetTrainJob: %v", err)
+		}
+
+		status := res.StatusCode()
+		if status == http.StatusUnauthorized && attempt < 1 {
+			_ = getToken()
+			continue
+		}
+
+		if status != http.StatusOK {
+			body := res.String()
+			var temp models.ErrorResult
+			if err = json.Unmarshal([]byte(body), &temp); err != nil {
+				log.Error("json.Unmarshal failed(%s): %v", body, err.Error())
+				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", body, err.Error())
+			}
+			log.Error("GetTrainJob failed(%d):%s(%s)", status, temp.ErrorCode, temp.ErrorMsg)
+			return &result, fmt.Errorf("获取作业详情失败(%d):%s(%s)", status, temp.ErrorCode, temp.ErrorMsg)
+		}
+
+		if !result.IsSuccess {
+			log.Error("GetTrainJob(%s) failed", jobID)
+			return &result, fmt.Errorf("获取作业详情失败")
+		}
+
+		return &result, nil
+	}
+}
+
+// GetTrainJobLog requests a slice of the log of one training-job version
+// from the "aom-log" endpoint.
+//
+// baseLine, logFile and order are forwarded as the "base_line", "log_file"
+// and "order" query parameters; lines is sent as its decimal string. When
+// the service answers 401, getToken is called once and the request is sent
+// again.
+//
+// A transport error yields a nil result. Every other failure returns the
+// result as decoded so far together with a non-nil error.
+func GetTrainJobLog(jobID, versionID, baseLine, logFile, order string, lines int) (*models.GetTrainJobLogResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.GetTrainJobLogResult
+
+	// attempt counts resends; only the first 401 triggers one.
+	for attempt := 0; ; attempt++ {
+		res, err := client.R().
+			SetQueryParams(map[string]string{
+				"base_line": baseLine,
+				"lines":     strconv.Itoa(lines),
+				"log_file":  logFile,
+				"order":     order,
+			}).
+			SetAuthToken(TOKEN).
+			SetResult(&result).
+			Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/aom-log")
+		if err != nil {
+			return nil, fmt.Errorf("resty GetTrainJobLog: %v", err)
+		}
+
+		status := res.StatusCode()
+		if status == http.StatusUnauthorized && attempt < 1 {
+			_ = getToken()
+			continue
+		}
+
+		if status != http.StatusOK {
+			body := res.String()
+			var temp models.ErrorResult
+			if err = json.Unmarshal([]byte(body), &temp); err != nil {
+				log.Error("json.Unmarshal failed(%s): %v", body, err.Error())
+				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", body, err.Error())
+			}
+			log.Error("GetTrainJobLog failed(%d):%s(%s)", status, temp.ErrorCode, temp.ErrorMsg)
+			return &result, fmt.Errorf("获取作业日志失败(%d):%s(%s)", status, temp.ErrorCode, temp.ErrorMsg)
+		}
+
+		if !result.IsSuccess {
+			log.Error("GetTrainJobLog(%s) failed", jobID)
+			return &result, fmt.Errorf("获取作业日志失败:%s", result.ErrorMsg)
+		}
+
+		return &result, nil
+	}
+}
+
+// GetTrainJobLogFileNames requests the log file names of one training-job
+// version from the "log/file-names" endpoint.
+//
+// When the service answers 401, getToken is called once and the request is
+// sent again.
+//
+// A transport error yields a nil result. Every other failure returns the
+// result as decoded so far together with a non-nil error.
+func GetTrainJobLogFileNames(jobID, versionID string) (*models.GetTrainJobLogFileNamesResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.GetTrainJobLogFileNamesResult
+
+	// attempt counts resends; only the first 401 triggers one.
+	for attempt := 0; ; attempt++ {
+		res, err := client.R().
+			SetAuthToken(TOKEN).
+			SetResult(&result).
+			Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/log/file-names")
+		if err != nil {
+			return nil, fmt.Errorf("resty GetTrainJobLogFileNames: %v", err)
+		}
+
+		status := res.StatusCode()
+		if status == http.StatusUnauthorized && attempt < 1 {
+			_ = getToken()
+			continue
+		}
+
+		if status != http.StatusOK {
+			body := res.String()
+			var temp models.ErrorResult
+			if err = json.Unmarshal([]byte(body), &temp); err != nil {
+				log.Error("json.Unmarshal failed(%s): %v", body, err.Error())
+				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", body, err.Error())
+			}
+			log.Error("GetTrainJobLogFileNames failed(%d):%s(%s)", status, temp.ErrorCode, temp.ErrorMsg)
+			return &result, fmt.Errorf("GetTrainJobLogFileNames failed(%d):%s(%s)", status, temp.ErrorCode, temp.ErrorMsg)
+		}
+
+		if !result.IsSuccess {
+			log.Error("GetTrainJobLogFileNames(%s) failed", jobID)
+			return &result, fmt.Errorf("获取作业日志文件失败:%s", result.ErrorMsg)
+		}
+
+		return &result, nil
+	}
+}
+
+// DelTrainJob sends a DELETE for the training job identified by jobID.
+//
+// When the service answers 401, getToken is called once and the request is
+// sent again. A non-nil result pointer is returned on every path, including
+// transport errors; the error is non-nil on any failure.
+func DelTrainJob(jobID string) (*models.TrainJobResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.TrainJobResult
+
+	// attempt counts resends; only the first 401 triggers one.
+	for attempt := 0; ; attempt++ {
+		res, err := client.R().
+			SetAuthToken(TOKEN).
+			SetResult(&result).
+			Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID)
+		if err != nil {
+			return &result, fmt.Errorf("resty DelTrainJob: %v", err)
+		}
+
+		status := res.StatusCode()
+		if status == http.StatusUnauthorized && attempt < 1 {
+			_ = getToken()
+			continue
+		}
+
+		if status != http.StatusOK {
+			body := res.String()
+			var temp models.ErrorResult
+			if err = json.Unmarshal([]byte(body), &temp); err != nil {
+				log.Error("json.Unmarshal failed(%s): %v", body, err.Error())
+				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", body, err.Error())
+			}
+			log.Error("DelTrainJob failed(%d):%s(%s)", status, temp.ErrorCode, temp.ErrorMsg)
+			return &result, fmt.Errorf("删除训练作业失败(%d):%s(%s)", status, temp.ErrorCode, temp.ErrorMsg)
+		}
+
+		if !result.IsSuccess {
+			log.Error("DelTrainJob(%s) failed", jobID)
+			return &result, fmt.Errorf("删除训练作业失败:%s", result.ErrorMsg)
+		}
+
+		return &result, nil
+	}
+}
+
+// StopTrainJob posts to the "stop" endpoint of one training-job version.
+//
+// When the service answers 401, getToken is called once and the request is
+// sent again. A non-nil result pointer is returned on every path, including
+// transport errors; the error is non-nil on any failure.
+func StopTrainJob(jobID, versionID string) (*models.TrainJobResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.TrainJobResult
+
+	// attempt counts resends; only the first 401 triggers one.
+	for attempt := 0; ; attempt++ {
+		res, err := client.R().
+			SetAuthToken(TOKEN).
+			SetResult(&result).
+			Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/stop")
+		if err != nil {
+			return &result, fmt.Errorf("resty StopTrainJob: %v", err)
+		}
+
+		status := res.StatusCode()
+		if status == http.StatusUnauthorized && attempt < 1 {
+			_ = getToken()
+			continue
+		}
+
+		if status != http.StatusOK {
+			body := res.String()
+			var temp models.ErrorResult
+			if err = json.Unmarshal([]byte(body), &temp); err != nil {
+				log.Error("json.Unmarshal failed(%s): %v", body, err.Error())
+				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", body, err.Error())
+			}
+			log.Error("StopTrainJob failed(%d):%s(%s)", status, temp.ErrorCode, temp.ErrorMsg)
+			return &result, fmt.Errorf("停止训练作业失败(%d):%s(%s)", status, temp.ErrorCode, temp.ErrorMsg)
+		}
+
+		if !result.IsSuccess {
+			log.Error("StopTrainJob(%s) failed", jobID)
+			return &result, fmt.Errorf("停止训练作业失败:%s", result.ErrorMsg)
+		}
+
+		return &result, nil
+	}
+}
+
+// DelTrainJobVersion sends a DELETE for one version of a training job.
+//
+// jobID and versionID are placed in the request path. When the service
+// answers 401, getToken is called once and the request is sent again. A
+// non-nil result pointer is returned on every path, including transport
+// errors; the error is non-nil on any failure.
+func DelTrainJobVersion(jobID string, versionID string) (*models.TrainJobResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.TrainJobResult
+
+	// attempt counts resends; only the first 401 triggers one.
+	for attempt := 0; ; attempt++ {
+		res, err := client.R().
+			SetAuthToken(TOKEN).
+			SetResult(&result).
+			Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID)
+		if err != nil {
+			return &result, fmt.Errorf("resty DelTrainJobVersion: %v", err)
+		}
+
+		status := res.StatusCode()
+		if status == http.StatusUnauthorized && attempt < 1 {
+			_ = getToken()
+			continue
+		}
+
+		if status != http.StatusOK {
+			body := res.String()
+			var temp models.ErrorResult
+			if err = json.Unmarshal([]byte(body), &temp); err != nil {
+				log.Error("json.Unmarshal failed(%s): %v", body, err.Error())
+				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", body, err.Error())
+			}
+			// Log under this function's own name so the entry is not
+			// mistaken for a whole-job deletion (DelTrainJob).
+			log.Error("DelTrainJobVersion failed(%d):%s(%s)", status, temp.ErrorCode, temp.ErrorMsg)
+			return &result, fmt.Errorf("删除训练作业版本失败(%d):%s(%s)", status, temp.ErrorCode, temp.ErrorMsg)
+		}
+
+		if !result.IsSuccess {
+			log.Error("DelTrainJobVersion(%s/%s) failed", jobID, versionID)
+			return &result, fmt.Errorf("删除训练作业版本失败:%s", result.ErrorMsg)
+		}
+
+		return &result, nil
+	}
+}
+
+// createInferenceJob posts createJobParams as JSON to the urlTrainJob
+// endpoint and decodes the answer into a CreateTrainJobResult.
+//
+// After every request that completes at the transport level, the marshalled
+// parameters are written to the info log. When the service answers 401,
+// getToken is called once and the request is sent again.
+//
+// Two service messages are mapped to short dedicated errors: an
+// "Invalid OBS path" message naming the boot file URL, and one naming the
+// data URL. A transport error yields a nil result; every other failure
+// returns the result as decoded so far together with a non-nil error.
+func createInferenceJob(createJobParams models.CreateInferenceJobParams) (*models.CreateTrainJobResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.CreateTrainJobResult
+
+	// attempt counts resends; only the first 401 triggers one.
+	for attempt := 0; ; attempt++ {
+		res, err := client.R().
+			SetHeader("Content-Type", "application/json").
+			SetAuthToken(TOKEN).
+			SetBody(createJobParams).
+			SetResult(&result).
+			Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob)
+		if err != nil {
+			return nil, fmt.Errorf("resty create inference-job: %s", err)
+		}
+
+		// A marshal failure only affects this log line, so it is ignored.
+		payload, _ := json.Marshal(createJobParams)
+		log.Info("%s", payload)
+
+		status := res.StatusCode()
+		if status == http.StatusUnauthorized && attempt < 1 {
+			_ = getToken()
+			continue
+		}
+
+		if status != http.StatusOK {
+			body := res.String()
+			var temp models.ErrorResult
+			if err = json.Unmarshal([]byte(body), &temp); err != nil {
+				log.Error("json.Unmarshal failed(%s): %v", body, err.Error())
+				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", body, err.Error())
+			}
+			log.Error("createInferenceJob failed(%d):%s(%s)", status, temp.ErrorCode, temp.ErrorMsg)
+
+			bootFileMsg := "Invalid OBS path '" + createJobParams.InfConfig.BootFileUrl + "'."
+			dataSetMsg := "Invalid OBS path '" + createJobParams.InfConfig.DataUrl + "'."
+			// Cases are tried in order, so the boot-file message wins when
+			// both URLs are identical.
+			switch temp.ErrorMsg {
+			case bootFileMsg:
+				log.Error("启动文件错误!createInferenceJob failed(%d):%s(%s)", status, temp.ErrorCode, temp.ErrorMsg)
+				return &result, fmt.Errorf("启动文件错误!")
+			case dataSetMsg:
+				log.Error("数据集错误!createInferenceJob failed(%d):%s(%s)", status, temp.ErrorCode, temp.ErrorMsg)
+				return &result, fmt.Errorf("数据集错误!")
+			}
+			return &result, fmt.Errorf("createInferenceJob failed(%d):%s(%s)", status, temp.ErrorCode, temp.ErrorMsg)
+		}
+
+		if !result.IsSuccess {
+			log.Error("createInferenceJob failed(%s): %s", result.ErrorCode, result.ErrorMsg)
+			return &result, fmt.Errorf("createInferenceJob failed(%s): %s", result.ErrorCode, result.ErrorMsg)
+		}
+
+		return &result, nil
+	}
+}
+
+// createNotebook2 posts createJobParams as JSON to the urlNotebook2 endpoint
+// and decodes the answer into a CreateNotebookResult.
+//
+// The request is resent at most once, after calling getToken, in either of
+// two situations: the service answers 401, or the response body carries the
+// modelartsIllegalToken error code. Both share the same single retry.
+//
+// The body is also decoded as a NotebookResult on every status code; a
+// non-empty ErrorCode there is reported as an error, with a fixed quota
+// message substituted when the code is errorCodeExceedLimit. A transport
+// error yields a nil result; every other failure returns the result as
+// decoded so far together with a non-nil error.
+func createNotebook2(createJobParams models.CreateNotebook2Params) (*models.CreateNotebookResult, error) {
+	checkSetting()
+	client := getRestyClient()
+	var result models.CreateNotebookResult
+
+	// attempt counts resends across both retry conditions.
+	for attempt := 0; ; attempt++ {
+		res, err := client.R().
+			SetHeader("Content-Type", "application/json").
+			SetAuthToken(TOKEN).
+			SetBody(createJobParams).
+			SetResult(&result).
+			Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2)
+		if err != nil {
+			return nil, fmt.Errorf("resty create notebook2: %s", err)
+		}
+
+		if res.StatusCode() == http.StatusUnauthorized && attempt < 1 {
+			_ = getToken()
+			continue
+		}
+
+		var response models.NotebookResult
+		if err = json.Unmarshal(res.Body(), &response); err != nil {
+			log.Error("json.Unmarshal failed: %s", err.Error())
+			return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
+		}
+
+		if len(response.ErrorCode) != 0 {
+			log.Error("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+			if response.ErrorCode == errorCodeExceedLimit {
+				response.ErrorMsg = "所选规格使用数量已超过最大配额限制。"
+			}
+			if response.ErrorCode == modelartsIllegalToken && attempt < 1 {
+				_ = getToken()
+				continue
+			}
+			return &result, fmt.Errorf("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+		}
+
+		return &result, nil
+	}
+}
diff --git a/modules/setting/setting.go b/modules/setting/setting.go
index 5c87b68c5a..945a7c6f87 100755
--- a/modules/setting/setting.go
+++ b/modules/setting/setting.go
@@ -528,6 +528,13 @@ var (
FlavorInfos string
TrainJobFLAVORINFOS string
+ //grampus config
+ Grampus = struct {
+ Host string
+ UserName string
+ Password string
+ }{}
+
//elk config
ElkUrl string
ElkUser string
@@ -1382,6 +1389,15 @@ func NewContext() {
Course.OrgName = sec.Key("org_name").MustString("")
Course.TeamName = sec.Key("team_name").MustString("")
+ GetGrampusConfig()
+}
+
+// GetGrampusConfig fills the package-level Grampus settings (Host, UserName,
+// Password) from the SERVER_HOST, USERNAME and PASSWORD keys of the
+// "grampus" section of Cfg. A missing key leaves the field as "".
+func GetGrampusConfig() {
+ sec := Cfg.Section("grampus")
+
+ Grampus.Host = sec.Key("SERVER_HOST").MustString("")
+ Grampus.UserName = sec.Key("USERNAME").MustString("")
+ Grampus.Password = sec.Key("PASSWORD").MustString("")
}
func SetRadarMapConfig() {
diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go
index 3e8329dfbb..8f31896616 100755
--- a/routers/repo/grampus.go
+++ b/routers/repo/grampus.go
@@ -1,22 +1,435 @@
package repo
import (
- "code.gitea.io/gitea/modules/base"
+ "code.gitea.io/gitea/modules/auth"
+ "code.gitea.io/gitea/modules/git"
+ "code.gitea.io/gitea/modules/modelarts"
+ "code.gitea.io/gitea/modules/util"
+ "encoding/json"
+ "io/ioutil"
"net/http"
+ "os"
+ "path"
+ "strconv"
+ "strings"
+ "time"
+ "code.gitea.io/gitea/models"
+ "code.gitea.io/gitea/modules/base"
+ "code.gitea.io/gitea/modules/cloudbrain"
"code.gitea.io/gitea/modules/context"
+ "code.gitea.io/gitea/modules/log"
+ "code.gitea.io/gitea/modules/setting"
)
const (
- tplGrampusTrainJobNew base.TplName = "repo/grampus/trainjob/new"
- tplGrampusTrainJobShow base.TplName = "repo/grampus/trainjob/show"
+ //GPU
+ tplGrampusTrainJobGPUNew base.TplName = "repo/grampus/trainjob/gpu/new"
+ tplGrampusTrainJobGPUShow base.TplName = "repo/grampus/trainjob/gpu/show"
+
+ //NPU
+ tplGrampusTrainJobNPUNew base.TplName = "repo/grampus/trainjob/npu/new"
+ tplGrampusTrainJobNPUShow base.TplName = "repo/grampus/trainjob/npu/show"
)
-func GrampusNew(ctx *context.Context) {
- err := cloudBrainNewDataPrepare(ctx)
+// GrampusTrainJobGPUNew renders the GPU train-job creation page. The page
+// data is filled by grampusGpuNewDataPrepare; if that fails a server error
+// is returned instead of the page.
+func GrampusTrainJobGPUNew(ctx *context.Context) {
+	if prepareErr := grampusGpuNewDataPrepare(ctx); prepareErr != nil {
+		ctx.ServerError("get new train-job info failed", prepareErr)
+		return
+	}
+
+	ctx.HTML(http.StatusOK, tplGrampusTrainJobGPUNew)
+}
+
+// grampusGpuNewDataPrepare fills ctx.Data with everything the GPU train-job
+// creation page reads: a suggested job name, the private and public image
+// lists, the user's attachments, mount paths, and the GPU / resource-spec
+// tables decoded from settings.
+//
+// Failing to fetch an image list is not fatal: the error text is stored in
+// ctx.Data["error"], logged, and the corresponding list is left unset when
+// no result was returned. Only a failure to load the user's attachments is
+// returned as an error.
+func grampusGpuNewDataPrepare(ctx *context.Context) error {
+	ctx.Data["PageIsCloudBrain"] = true
+	t := time.Now()
+	var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
+	ctx.Data["display_job_name"] = displayJobName
+
+	// placeView is the display form of an image location: locations that
+	// start with "192.168" are shown from their first "/" onwards. A
+	// location without any "/" is shown unchanged instead of being sliced
+	// with index -1, which would panic.
+	placeView := func(place string) string {
+		if strings.HasPrefix(place, "192.168") {
+			if slash := strings.Index(place, "/"); slash >= 0 {
+				return place[slash:]
+			}
+		}
+		return place
+	}
+
+	//get valid images
+	result, err := cloudbrain.GetImages()
+	if err != nil {
+		ctx.Data["error"] = err.Error()
+		log.Error("cloudbrain.GetImages failed: %v", err, ctx.Data["MsgID"])
+	}
+	// result may be nil when the request failed, so guard before use.
+	if result != nil {
+		for i := range result.Payload.ImageInfo {
+			result.Payload.ImageInfo[i].PlaceView = placeView(result.Payload.ImageInfo[i].Place)
+		}
+		ctx.Data["images"] = result.Payload.ImageInfo
+	}
+
+	resultPublic, err := cloudbrain.GetPublicImages()
+	if err != nil {
+		ctx.Data["error"] = err.Error()
+		log.Error("cloudbrain.GetPublicImages failed: %v", err, ctx.Data["MsgID"])
+	}
+	if resultPublic != nil {
+		for i := range resultPublic.Payload.ImageInfo {
+			resultPublic.Payload.ImageInfo[i].PlaceView = placeView(resultPublic.Payload.ImageInfo[i].Place)
+		}
+		ctx.Data["public_images"] = resultPublic.Payload.ImageInfo
+	}
+
+	//get valid dataset
+	attachs, err := models.GetAllUserAttachments(ctx.User.ID)
+	if err != nil {
+		log.Error("GetAllUserAttachments failed: %v", err, ctx.Data["MsgID"])
+		return err
+	}
+
+	ctx.Data["attachments"] = attachs
+	ctx.Data["command"] = cloudbrain.Command
+	ctx.Data["code_path"] = cloudbrain.CodeMountPath
+	ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath
+	ctx.Data["model_path"] = cloudbrain.ModelMountPath
+	ctx.Data["benchmark_path"] = cloudbrain.BenchMarkMountPath
+	ctx.Data["is_benchmark_enabled"] = setting.IsBenchmarkEnabled
+
+	//get valid resource specs
+	// Each table below is decoded from its setting only while the shared
+	// variable is still nil.
+	// NOTE(review): the Unmarshal errors are ignored; if decoding fails the
+	// variable presumably stays nil and the field access after it would
+	// panic - confirm the settings are validated elsewhere.
+	if categories == nil {
+		json.Unmarshal([]byte(setting.BenchmarkCategory), &categories)
+	}
+	ctx.Data["benchmark_categories"] = categories.Category
+
+	ctx.Data["benchmark_types"] = GetBenchmarkTypes(ctx).BenchmarkType
+
+	if gpuInfos == nil {
+		json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos)
+	}
+	ctx.Data["gpu_types"] = gpuInfos.GpuInfo
+
+	if trainGpuInfos == nil {
+		json.Unmarshal([]byte(setting.TrainGpuTypes), &trainGpuInfos)
+	}
+	ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo
+
+	if benchmarkGpuInfos == nil {
+		json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos)
+	}
+	ctx.Data["benchmark_gpu_types"] = benchmarkGpuInfos.GpuInfo
+
+	if benchmarkResourceSpecs == nil {
+		json.Unmarshal([]byte(setting.BenchmarkResourceSpecs), &benchmarkResourceSpecs)
+	}
+	ctx.Data["benchmark_resource_specs"] = benchmarkResourceSpecs.ResourceSpec
+
+	if cloudbrain.ResourceSpecs == nil {
+		json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs)
+	}
+	ctx.Data["resource_specs"] = cloudbrain.ResourceSpecs.ResourceSpec
+
+	if cloudbrain.TrainResourceSpecs == nil {
+		json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs)
+	}
+	ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec
+	ctx.Data["params"] = ""
+	ctx.Data["branchName"] = ctx.Repo.BranchName
+
+	ctx.Data["snn4imagenet_path"] = cloudbrain.Snn4imagenetMountPath
+	ctx.Data["is_snn4imagenet_enabled"] = setting.IsSnn4imagenetEnabled
+
+	ctx.Data["brainscore_path"] = cloudbrain.BrainScoreMountPath
+	ctx.Data["is_brainscore_enabled"] = setting.IsBrainScoreEnabled
+
+	ctx.Data["cloudbraintype"] = models.TypeCloudBrainOne
+
+	ctx.Data["benchmarkMode"] = ctx.Query("benchmarkMode")
+
+	return nil
+}
+
+// GrampusTrainJobNPUNew renders the NPU train-job creation page. The page
+// data is filled by trainJobNpuNewDataPrepare; if that returns an error a
+// server error is reported instead of the page.
+func GrampusTrainJobNPUNew(ctx *context.Context) {
+ err := trainJobNpuNewDataPrepare(ctx)
if err != nil {
ctx.ServerError("get new train-job info failed", err)
return
}
- ctx.HTML(http.StatusOK, tplGrampusTrainJobNew)
+ ctx.HTML(200, tplGrampusTrainJobNPUNew)
+}
+
+// trainJobNpuNewDataPrepare fills ctx.Data with everything the NPU train-job
+// creation page reads: a suggested job name, the user's train attachments,
+// the resource pool / engine / engine version / flavor tables decoded from
+// settings, and the first page of custom parameter configurations.
+//
+// Any failure is logged and returned. The function does not write a
+// response itself; reporting the error to the client is left to the caller,
+// so the response is written only once.
+func trainJobNpuNewDataPrepare(ctx *context.Context) error {
+	ctx.Data["PageIsCloudBrain"] = true
+
+	t := time.Now()
+	var displayJobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
+	ctx.Data["display_job_name"] = displayJobName
+
+	//get valid dataset
+	attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
+	if err != nil {
+		log.Error("GetModelArtsTrainAttachments failed: %v", err, ctx.Data["MsgID"])
+		return err
+	}
+	ctx.Data["attachments"] = attachs
+
+	//get valid resource specs
+	var resourcePools modelarts.ResourcePool
+	if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
+		log.Error("json.Unmarshal ResourcePools failed: %v", err, ctx.Data["MsgID"])
+		return err
+	}
+	ctx.Data["resource_pools"] = resourcePools.Info
+
+	var engines modelarts.Engine
+	if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
+		log.Error("json.Unmarshal Engines failed: %v", err, ctx.Data["MsgID"])
+		return err
+	}
+	ctx.Data["engines"] = engines.Info
+
+	var versionInfos modelarts.VersionInfo
+	if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
+		log.Error("json.Unmarshal EngineVersions failed: %v", err, ctx.Data["MsgID"])
+		return err
+	}
+	ctx.Data["engine_versions"] = versionInfos.Version
+
+	var flavorInfos modelarts.Flavor
+	if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
+		log.Error("json.Unmarshal TrainJobFLAVORINFOS failed: %v", err, ctx.Data["MsgID"])
+		return err
+	}
+	ctx.Data["flavor_infos"] = flavorInfos.Info
+
+	ctx.Data["params"] = ""
+	ctx.Data["branchName"] = ctx.Repo.BranchName
+
+	configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
+	if err != nil {
+		log.Error("getConfigList failed: %v", err, ctx.Data["MsgID"])
+		return err
+	}
+	ctx.Data["config_list"] = configList.ParaConfigs
+	ctx.Data["cloudbraintype"] = models.TypeCloudBrainTwo
+
+	return nil
+}
+
+// GrampusTrainJobNpuCreate handles submission of the NPU train-job form.
+//
+// In order it: rejects the request when the user already has a running or
+// waiting train job, validates the form, rejects a duplicate display name
+// within the repository, downloads the selected branch to a local job
+// directory, creates the output and log directories in OBS, uploads the
+// code, assembles the run parameters (adding the Ascend device target when
+// the user supplied none), optionally saves the parameters as a named
+// config, and finally asks modelarts.GenerateTrainJob to create the job.
+//
+// On success the client is redirected to the repository's train-job list.
+// On failure the creation form is rendered again with an error message,
+// except for the final params re-validation, which answers with a server
+// error.
+func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
+	versionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
+	displayJobName := form.DisplayJobName
+	jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
+	uuid := form.Attachment
+	description := form.Description
+	workServerNumber := form.WorkServerNumber
+	engineID := form.EngineID
+	bootFile := form.BootFile
+	flavorCode := form.Flavor
+	params := form.Params
+	poolID := form.PoolID
+	isSaveParam := form.IsSaveParam
+	repo := ctx.Repo.Repository
+	branchName := form.BranchName
+	isLatestVersion := modelarts.IsLatestVersion
+	flavorName := form.FlavorName
+	versionCount := modelarts.VersionCount
+	engineName := form.EngineName
+
+	// dataPath below slices the first two characters of the attachment id,
+	// so a shorter (for example empty) id must be rejected before that.
+	if len(uuid) < 2 {
+		log.Error("invalid attachment uuid: %s", uuid, ctx.Data["MsgID"])
+		trainJobErrorNewDataPrepare(ctx, form)
+		ctx.RenderWithErr("invalid dataset attachment", tplModelArtsTrainJobNew, &form)
+		return
+	}
+
+	codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
+	codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
+	outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + versionOutputPath + "/"
+	logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + versionOutputPath + "/"
+	dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
+
+	count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID)
+	if err != nil {
+		log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
+		trainJobErrorNewDataPrepare(ctx, form)
+		ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form)
+		return
+	}
+	if count >= 1 {
+		log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
+		trainJobErrorNewDataPrepare(ctx, form)
+		ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobNew, &form)
+		return
+	}
+
+	if err := paramCheckCreateTrainJob(form); err != nil {
+		log.Error("paramCheckCreateTrainJob failed:(%v)", err)
+		trainJobErrorNewDataPrepare(ctx, form)
+		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
+		return
+	}
+
+	//Determine whether the task name of the task in the project is duplicated
+	tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName)
+	if err == nil {
+		if len(tasks) != 0 {
+			log.Error("the job name did already exist", ctx.Data["MsgID"])
+			trainJobErrorNewDataPrepare(ctx, form)
+			ctx.RenderWithErr("the job name did already exist", tplModelArtsTrainJobNew, &form)
+			return
+		}
+	} else if !models.IsErrJobNotExist(err) {
+		log.Error("system error, %v", err, ctx.Data["MsgID"])
+		trainJobErrorNewDataPrepare(ctx, form)
+		ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form)
+		return
+	}
+
+	//todo: del the codeLocalPath
+	// Remove a leftover local code directory from an earlier attempt.
+	_, err = ioutil.ReadDir(codeLocalPath)
+	if err == nil {
+		os.RemoveAll(codeLocalPath)
+	}
+
+	gitRepo, err := git.OpenRepository(repo.RepoPath())
+	if err != nil {
+		// Without this check the nil repository is dereferenced below.
+		log.Error("git.OpenRepository failed: %s (%v)", repo.FullName(), err)
+		trainJobErrorNewDataPrepare(ctx, form)
+		ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form)
+		return
+	}
+	// The commit id is only recorded on the job request, so a lookup failure
+	// is logged and the job continues with an empty id, as before.
+	commitID, err := gitRepo.GetBranchCommitID(branchName)
+	if err != nil {
+		log.Error("GetBranchCommitID(%s) failed: %s (%v)", branchName, repo.FullName(), err)
+	}
+
+	if err := downloadCode(repo, codeLocalPath, branchName); err != nil {
+		log.Error("downloadCode failed, server timed out: %s (%v)", repo.FullName(), err)
+		trainJobErrorNewDataPrepare(ctx, form)
+		ctx.RenderWithErr("Create task failed, server timed out", tplModelArtsTrainJobNew, &form)
+		return
+	}
+
+	//todo: upload code (send to file_server todo this work?)
+	if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + versionOutputPath + "/"); err != nil {
+		log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
+		trainJobErrorNewDataPrepare(ctx, form)
+		ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form)
+		return
+	}
+
+	if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + versionOutputPath + "/"); err != nil {
+		log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
+		trainJobErrorNewDataPrepare(ctx, form)
+		ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form)
+		return
+	}
+
+	if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
+		log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
+		trainJobErrorNewDataPrepare(ctx, form)
+		ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form)
+		return
+	}
+
+	// Build the run parameters: user-supplied entries minus the train/data
+	// URL labels, plus a default device target when none was given.
+	var parameters models.Parameters
+	param := make([]models.Parameter, 0)
+	existDeviceTarget := false
+	if len(params) != 0 {
+		err := json.Unmarshal([]byte(params), &parameters)
+		if err != nil {
+			log.Error("Failed to Unmarshal params: %s (%v)", params, err)
+			trainJobErrorNewDataPrepare(ctx, form)
+			ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobNew, &form)
+			return
+		}
+
+		for _, parameter := range parameters.Parameter {
+			if parameter.Label == modelarts.DeviceTarget {
+				existDeviceTarget = true
+			}
+			if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
+				param = append(param, models.Parameter{
+					Label: parameter.Label,
+					Value: parameter.Value,
+				})
+			}
+		}
+	}
+	if !existDeviceTarget {
+		param = append(param, models.Parameter{
+			Label: modelarts.DeviceTarget,
+			Value: modelarts.Ascend,
+		})
+	}
+
+	//save param config
+	if isSaveParam == "on" {
+		saveparams := append(param, models.Parameter{
+			Label: modelarts.TrainUrl,
+			Value: outputObsPath,
+		}, models.Parameter{
+			Label: modelarts.DataUrl,
+			Value: dataPath,
+		})
+		if form.ParameterTemplateName == "" {
+			log.Error("ParameterTemplateName is empty")
+			// NOTE(review): every other error path calls
+			// trainJobErrorNewDataPrepare(ctx, form); confirm that
+			// trainJobNewDataPrepare is intended here.
+			trainJobNewDataPrepare(ctx)
+			ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form)
+			return
+		}
+
+		_, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
+			ConfigName:  form.ParameterTemplateName,
+			Description: form.PrameterDescription,
+			DataUrl:     dataPath,
+			AppUrl:      codeObsPath,
+			BootFileUrl: codeObsPath + bootFile,
+			TrainUrl:    outputObsPath,
+			Flavor: models.Flavor{
+				Code: flavorCode,
+			},
+			WorkServerNum: workServerNumber,
+			EngineID:      int64(engineID),
+			LogUrl:        logObsPath,
+			PoolID:        poolID,
+			Parameter:     saveparams,
+		})
+
+		if err != nil {
+			log.Error("Failed to CreateTrainJobConfig: %v", err)
+			trainJobErrorNewDataPrepare(ctx, form)
+			ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form)
+			return
+		}
+	}
+
+	req := &modelarts.GenerateTrainJobReq{
+		JobName:           jobName,
+		DisplayJobName:    displayJobName,
+		DataUrl:           dataPath,
+		Description:       description,
+		CodeObsPath:       codeObsPath,
+		BootFileUrl:       codeObsPath + bootFile,
+		BootFile:          bootFile,
+		TrainUrl:          outputObsPath,
+		FlavorCode:        flavorCode,
+		WorkServerNumber:  workServerNumber,
+		EngineID:          int64(engineID),
+		LogUrl:            logObsPath,
+		PoolID:            poolID,
+		Uuid:              uuid,
+		Parameters:        param,
+		CommitID:          commitID,
+		IsLatestVersion:   isLatestVersion,
+		BranchName:        branchName,
+		Params:            form.Params,
+		FlavorName:        flavorName,
+		EngineName:        engineName,
+		VersionCount:      versionCount,
+		TotalVersionCount: modelarts.TotalVersionCount,
+	}
+
+	// Check that params also decodes as modelarts.Parameters and report a
+	// failure to the frontend. Empty params are skipped, matching the parse
+	// above, so a job without extra parameters is not rejected here.
+	if len(params) != 0 {
+		var checked modelarts.Parameters
+		if err := json.Unmarshal([]byte(params), &checked); err != nil {
+			ctx.ServerError("json.Unmarshal failed:", err)
+			return
+		}
+	}
+
+	err = modelarts.GenerateTrainJob(ctx, req)
+	if err != nil {
+		log.Error("GenerateTrainJob failed:%v", err.Error())
+		trainJobErrorNewDataPrepare(ctx, form)
+		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
+		return
+	}
+	ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
+}
diff --git a/routers/routes/routes.go b/routers/routes/routes.go
index a64eb0fae8..ab3e2dd559 100755
--- a/routers/routes/routes.go
+++ b/routers/routes/routes.go
@@ -1085,14 +1085,26 @@ func RegisterRoutes(m *macaron.Macaron) {
}, context.RepoRef())
m.Group("/grampus", func() {
m.Group("/train-job", func() {
- m.Group("/:jobid", func() {
- m.Get("", reqRepoCloudBrainReader, repo.CloudBrainTrainJobShow)
- m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainStop)
- m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainTrainJobDel)
- m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel)
+ m.Group("/gpu", func() {
+ m.Group("/:jobid", func() {
+ m.Get("", reqRepoCloudBrainReader, repo.CloudBrainTrainJobShow)
+ m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainStop)
+ m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainTrainJobDel)
+ m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel)
+ })
+ m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusTrainJobGPUNew)
+ //m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.GrampusTrainJobCreate)
+ })
+ m.Group("/npu", func() {
+ m.Group("/:jobid", func() {
+ m.Get("", reqRepoCloudBrainReader, repo.CloudBrainTrainJobShow)
+ m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainStop)
+ m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainTrainJobDel)
+ m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel)
+ })
+ m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusTrainJobNPUNew)
+ //m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.GrampusTrainJobCreate)
})
- m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusNew)
- m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.GrampusCreate)
})
}, context.RepoRef())
m.Group("/modelmanage", func() {
diff --git a/templates/repo/grampus/trainjob/gpu/new.tmpl b/templates/repo/grampus/trainjob/gpu/new.tmpl
new file mode 100755
index 0000000000..7c42eba756
--- /dev/null
+++ b/templates/repo/grampus/trainjob/gpu/new.tmpl
@@ -0,0 +1,447 @@
+{{template "base/head" .}}
+
+
+
+
+ {{template "repo/header" .}}
+
+ {{template "base/alert" .}}
+
+
+
+
+{{template "base/footer" .}}
+
+
\ No newline at end of file
diff --git a/templates/repo/grampus/trainjob/gpu/show.tmpl b/templates/repo/grampus/trainjob/gpu/show.tmpl
new file mode 100755
index 0000000000..f1087abcfb
--- /dev/null
+++ b/templates/repo/grampus/trainjob/gpu/show.tmpl
@@ -0,0 +1,731 @@
+{{template "base/head" .}}
+
+
+
+ {{template "repo/header" .}}
+
+
+ {{range $k ,$v := .version_list_task}}
+
+
+
+
+
+
+
+
+
+ {{TimeSinceUnix1 .CreatedUnix}}
+
+ {{$.i18n.Tr "repo.modelarts.status"}}:
+ {{.Status}}
+
+ {{$.i18n.Tr "repo.modelarts.train_job.dura_time"}}:
+ {{$.duration}}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
{{.VersionName}}
+
/
+
+
+
+
+
+
+
+
+
+
+ {{end}} {{template "base/paginate" .}}
+
+
+
+
+
+
+
+
你确认删除该任务么?此任务一旦删除不可恢复。
+
+
+
+
+
+
+
+{{template "base/footer" .}}
+
+
\ No newline at end of file
diff --git a/templates/repo/grampus/trainjob/npu/new.tmpl b/templates/repo/grampus/trainjob/npu/new.tmpl
new file mode 100755
index 0000000000..6f5f5455f0
--- /dev/null
+++ b/templates/repo/grampus/trainjob/npu/new.tmpl
@@ -0,0 +1,475 @@
+{{template "base/head" .}}
+
+
+
+
+ {{template "repo/header" .}}
+
+ {{template "base/alert" .}}
+
+
+
+
+ {{.CsrfTokenHtml}}
+
+
+
+
+
+ {{.i18n.Tr "cloudbrain.compute_resource"}}
+
+
+
+ {{.i18n.Tr "repo.modelarts.train_job.job_name"}}
+
+ {{.i18n.Tr "cloudbrain.job_name_rule"}}
+
+
+
+ {{.i18n.Tr "repo.modelarts.train_job.description"}}
+
+
+
+
+
+
+
+
+ {{.i18n.Tr "repo.modelarts.code_version"}}
+
+ {{if .branch_name}}
+ {{.branch_name}}
+ {{range $k, $v :=.Branches}}
+ {{ if ne $v $.branch_name }}
+ {{$v}}
+ {{end}}
+ {{end}}
+ {{else}}
+ {{.branchName}}
+ {{range $k, $v :=.Branches}}
+ {{ if ne $v $.branchName }}
+ {{$v}}
+ {{end}}
+ {{end}}
+ {{end}}
+
+
+
+
+
+
+
{{.i18n.Tr "repo.modelarts.train_job.AI_driver"}}
+
+
+ {{range .engines}}
+ {{.Value}}
+ {{end}}
+
+
+
+
+
+ {{range .engine_versions}}
+ {{.Value}}
+ {{end}}
+
+
+
+
+
+
+
+
+ {{template "custom/select_dataset_train" .}}
+ {{.i18n.Tr "cloudbrain.dataset_path_rule"}}
+
+
{{.i18n.Tr "repo.modelarts.train_job.run_parameter"}}
+
{{.i18n.Tr "repo.modelarts.train_job.add_run_parameter"}}
+
+
+ {{if ne 0 (len .params)}}
+ {{range $k ,$v := .params}}
+
+ {{end}}
+ {{end}}
+
+
+
+
+
+ {{.i18n.Tr "repo.modelarts.train_job.resource_pool"}}
+
+ {{range .resource_pools}}
+ {{.Value}}
+ {{end}}
+
+
+
+
+
{{.i18n.Tr "repo.modelarts.train_job.resource_type"}}
+
+
+
+
train-private-1
+
{{svg "octicon-verified" 16}} 运行中
+
CPU:192 核 2048GiB
+
+
+
+
+
+ {{.i18n.Tr "repo.modelarts.train_job.standard"}}
+
+ {{range .flavor_infos}}
+ {{.Value}}
+ {{end}}
+
+
+
+
{{.i18n.Tr "repo.modelarts.train_job.amount_of_compute_node"}}
+
+
+
+
+
+
+
+
+
+
+
+
+{{template "base/footer" .}}
+
+
diff --git a/templates/repo/grampus/trainjob/npu/show.tmpl b/templates/repo/grampus/trainjob/npu/show.tmpl
new file mode 100755
index 0000000000..8f168fcf9f
--- /dev/null
+++ b/templates/repo/grampus/trainjob/npu/show.tmpl
@@ -0,0 +1,1008 @@
+{{template "base/head" .}}
+
+
+
+ {{template "repo/header" .}}
+
+
+ {{range $k ,$v := .version_list_task}}
+
+
+
+
+
+
+
+
+ {{$.CsrfTokenHtml}}
+ {{if and (.CanModify) (eq .Status "COMPLETED") ($.Permission.CanWrite $.UnitTypeModelManage) }}
+
+ {{else}}
+
+ {{end}}
+
+ {{if .CanModify}}
+
+ {{else}}
+
+ {{end}}
+
+ {{if .CanDel}}
+
+ {{else}}
+
+ {{end}}
+
+
+ {{if .CanDel}}
+
+ {{else}}
+
+ {{end}}
+
+
+
+
+ {{if not (eq .Cloudbrain.StartTime 0)}}
+ {{TimeSinceUnix1 .Cloudbrain.StartTime}}
+ {{else}}
+ {{TimeSinceUnix1 .Cloudbrain.CreatedUnix}}
+ {{end}}
+
+ {{$.i18n.Tr "repo.modelarts.current_version"}}:{{.VersionName}}
+
+ {{$.i18n.Tr "repo.modelarts.parent_version"}}:{{.PreVersionName}}
+ {{$.i18n.Tr "repo.modelarts.status"}}:
+ {{.Status}}
+
+ {{$.i18n.Tr "repo.modelarts.train_job.dura_time"}}:
+ {{.TrainJobDuration}}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
{{.VersionName}}
+
/
+
+
+
+
+
+
+
+
+
+
+ {{end}} {{template "base/paginate" .}}
+
+
+
+
+
+
+
+
{{.i18n.Tr "cloudbrain.task_delete_confirm"}}
+
+
+
+ {{.i18n.Tr "cloudbrain.operate_cancel"}}
+
+
+ {{.i18n.Tr "cloudbrain.operate_confirm"}}
+
+
+
+
+
+
+
+
+
+
+
+
+ {{$.CsrfTokenHtml}}
+
+
+
+
+
+ 模型名称
+
+
+
+ 模型版本
+
+
+
+ 模型标签
+
+
+
+ 模型描述
+
+
+
+
+
+ {{.i18n.Tr "repo.model.manage.sava_model"}}
+
+
+
+
+ {{.i18n.Tr "repo.cloudbrain.cancel"}}
+
+
+
+
+
+
+
+{{template "base/footer" .}}
+
+
\ No newline at end of file
--
2.34.1
From bdc509391ff11aa011717a30e29029e672487a76 Mon Sep 17 00:00:00 2001
From: lewis <747342561@qq.com>
Date: Mon, 23 May 2022 17:57:13 +0800
Subject: [PATCH 03/56] create job
---
models/cloudbrain.go | 59 ++++
modules/auth/grampus.go | 23 ++
modules/cloudbrain/cloudbrain.go | 2 -
modules/grampus/grampus.go | 246 +++-----------
modules/grampus/resty.go | 533 +------------------------------
routers/repo/grampus.go | 185 +++++------
routers/repo/modelarts.go | 19 +-
routers/routes/routes.go | 2 +-
8 files changed, 234 insertions(+), 835 deletions(-)
create mode 100755 modules/auth/grampus.go
diff --git a/models/cloudbrain.go b/models/cloudbrain.go
index e28ba3ea5c..f775626ada 100755
--- a/models/cloudbrain.go
+++ b/models/cloudbrain.go
@@ -24,6 +24,7 @@ type ModelArtsJobStatus string
const (
TypeCloudBrainOne int = iota
TypeCloudBrainTwo
+ TypeCloudBrainGrampus
TypeCloudBrainAll = -1
)
@@ -98,6 +99,14 @@ const (
ModelArtsTrainJobCheckFailed ModelArtsJobStatus = "CHECK_FAILED" //审核作业失败
DURATION_STR_ZERO = "00:00:00"
+
+ //grampus
+ GrampusStatusPending = "pending"
+ GrampusStatusRunning = "running"
+ GrampusStatusFailed = "failed"
+ GrampusStatusSucceeded = "succeeded"
+ GrampusStatusStopped = "stopped"
+ GrampusStatusUnknown = "unknown"
)
type Cloudbrain struct {
@@ -328,6 +337,7 @@ type CloudbrainsOptions struct {
JobTypeNot bool
NeedRepoInfo bool
RepoIDList []int64
+ ComputeResource string
}
type TaskPod struct {
@@ -1150,6 +1160,44 @@ type LogFile struct {
Name string
}
+// GrampusResult is the error envelope decoded from the JSON keys
+// "errorCode" and "errorMsg". It is embedded in CreateGrampusJobResponse;
+// createJob in modules/grampus treats a non-zero ErrorCode as a failure.
+type GrampusResult struct {
+ ErrorCode int `json:"errorCode"`
+ ErrorMsg string `json:"errorMsg"`
+}
+
+// GrampusJobInfo describes a job in a Grampus response. It is decoded from
+// the JSON keys given in the field tags.
+type GrampusJobInfo struct {
+ // NOTE(review): the *At fields look like Unix timestamps and RunSec like a
+ // duration in seconds; the unit is not established here, confirm against
+ // the Grampus API.
+ StartedAt int64 `json:"startedAt"`
+ RunSec int64 `json:"runSec"`
+ CompletedAt int64 `json:"completedAt"`
+ CreatedAt int64 `json:"createdAt"`
+ UpdatedAt int64 `json:"updatedAt"`
+ Desc string `json:"desc"`
+ // JobID is read from the "id" key; GenerateTrainJob stores it as the
+ // Cloudbrain record's JobID.
+ JobID string `json:"id"`
+ Name string `json:"name"`
+ Status string `json:"status"`
+ UserID string `json:"userId"`
+ Tasks []GrampusTasks `json:"tasks"`
+}
+
+// CreateGrampusJobResponse is the result type createJob decodes the
+// create-job response into: the embedded GrampusResult error envelope plus
+// the job description found under the "otJob" key.
+type CreateGrampusJobResponse struct {
+ GrampusResult
+ JobInfo GrampusJobInfo `json:"otJob"`
+}
+
+// GrampusTasks is one entry of the "tasks" array in a Grampus job. Despite
+// the plural name, a value describes a single task; GenerateTrainJob sends
+// exactly one per job, reusing the job name as the task name.
+type GrampusTasks struct {
+ Command string `json:"command"`
+ Name string `json:"name"`
+ ImageId string `json:"imageId"`
+ ResourceSpecId string `json:"resourceSpecId"`
+ ImageUrl string `json:"imageUrl"`
+}
+
+// CreateGrampusJobRequest is the JSON body createJob posts to the Grampus
+// train-job endpoint: a job name and the list of tasks to run.
+type CreateGrampusJobRequest struct {
+ Name string `json:"name"`
+ Tasks []GrampusTasks `json:"tasks"`
+}
+
func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
sess := x.NewSession()
defer sess.Close()
@@ -1179,6 +1227,12 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
)
}
+	// Restrict by compute resource only when the caller asked for one.
+	// The check must be "non-empty": a length is never negative, so testing
+	// `>= 0` would add `compute_resource = ''` to every query and hide all
+	// tasks from callers that leave ComputeResource unset.
+	if len(opts.ComputeResource) > 0 {
+		cond = cond.And(
+			builder.Eq{"cloudbrain.compute_resource": opts.ComputeResource},
+		)
+	}
+
if len(opts.JobTypes) > 0 {
if opts.JobTypeNot {
cond = cond.And(
@@ -1589,6 +1643,11 @@ func GetCloudbrainInferenceJobCountByUserID(userID int64) (int, error) {
return int(count), err
}
+// GetGrampusCountByUserID returns how many Grampus tasks of the given job
+// type and compute resource belong to userID and are still in the pending or
+// running state.
+func GetGrampusCountByUserID(userID int64, jobType, computeResource string) (int, error) {
+	// Build the query step by step: active statuses first, then the
+	// ownership/type filter, then the compute-resource filter.
+	query := x.In("status", GrampusStatusPending, GrampusStatusRunning)
+	query = query.And("job_type = ? and user_id = ? and type = ?", jobType, userID, TypeCloudBrainGrampus)
+	query = query.And("compute_resource = ?", computeResource)
+
+	total, err := query.Count(new(Cloudbrain))
+	return int(total), err
+}
+
func UpdateInferenceJob(job *Cloudbrain) error {
return updateInferenceJob(x, job)
}
diff --git a/modules/auth/grampus.go b/modules/auth/grampus.go
new file mode 100755
index 0000000000..2cfaf70061
--- /dev/null
+++ b/modules/auth/grampus.go
@@ -0,0 +1,23 @@
+package auth
+
+import (
+ "gitea.com/macaron/binding"
+ "gitea.com/macaron/macaron"
+)
+
+// CreateGrampusTrainJobForm holds the fields submitted when creating a
+// Grampus train job. Each field is bound from the form key named in its
+// `form` tag; every field except Description is marked Required.
+type CreateGrampusTrainJobForm struct {
+ DisplayJobName string `form:"display_job_name" binding:"Required"`
+ JobName string `form:"job_name" binding:"Required"`
+ Attachment string `form:"attachment" binding:"Required"`
+ BootFile string `form:"boot_file" binding:"Required"`
+ Flavor string `form:"flavor" binding:"Required"`
+ Params string `form:"run_para_list" binding:"Required"`
+ Description string `form:"description"`
+ BranchName string `form:"branch_name" binding:"Required"`
+ // NOTE(review): the key is spelled "flaver_names"; presumably it must match
+ // the input name used by the template, so verify before renaming.
+ FlavorName string `form:"flaver_names" binding:"Required"`
+ EngineName string `form:"engine_names" binding:"Required"`
+}
+
+// Validate hands the binding errors for this form, together with the request
+// context's data and locale, to the package's shared validate helper and
+// returns whatever that helper yields.
+func (f *CreateGrampusTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors {
+	data, locale := ctx.Data, ctx.Locale
+	return validate(errs, data, f, locale)
+}
diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go
index a71389741d..dc1d0e4609 100755
--- a/modules/cloudbrain/cloudbrain.go
+++ b/modules/cloudbrain/cloudbrain.go
@@ -48,8 +48,6 @@ func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, er
if !ctx.IsSigned {
return false
}
- log.Info("is repo owner:" + strconv.FormatBool(ctx.IsUserRepoOwner()))
- log.Info("is user admin:" + strconv.FormatBool(ctx.IsUserSiteAdmin()))
if err != nil {
return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin()
diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go
index b60afb5cce..26e143429f 100755
--- a/modules/grampus/grampus.go
+++ b/modules/grampus/grampus.go
@@ -1,13 +1,11 @@
package grampus
import (
- "code.gitea.io/gitea/modules/timeutil"
- "strconv"
-
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/context"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/notification"
+ "code.gitea.io/gitea/modules/timeutil"
)
const (
@@ -21,19 +19,6 @@ const (
NotebookType = "Ascend"
FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"
- //train-job
- // ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}"
- // Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
- // EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
- // "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
- // "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
- // "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
- // "]}"
- // TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
- // "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
- // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
- // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
- // "]}"
CodePath = "/code/"
OutputPath = "/output/"
ResultPath = "/result/"
@@ -65,7 +50,12 @@ var (
)
type GenerateTrainJobReq struct {
- JobName string
+ JobName string
+ Command string
+ ResourceSpecId string
+ ImageUrl string
+ ImageId string
+
DisplayJobName string
Uuid string
Description string
@@ -74,15 +64,10 @@ type GenerateTrainJobReq struct {
BootFileUrl string
DataUrl string
TrainUrl string
- FlavorCode string
- LogUrl string
- PoolID string
WorkServerNumber int
EngineID int64
- Parameters []models.Parameter
CommitID string
IsLatestVersion string
- Params string
BranchName string
PreVersionId int64
PreVersionName string
@@ -90,139 +75,54 @@ type GenerateTrainJobReq struct {
VersionCount int
EngineName string
TotalVersionCount int
-}
-
-type GenerateInferenceJobReq struct {
- JobName string
- DisplayJobName string
- Uuid string
- Description string
- CodeObsPath string
- BootFile string
- BootFileUrl string
- DataUrl string
- TrainUrl string
- FlavorCode string
- LogUrl string
- PoolID string
- WorkServerNumber int
- EngineID int64
- Parameters []models.Parameter
- CommitID string
- Params string
- BranchName string
- FlavorName string
- EngineName string
- LabelName string
- IsLatestVersion string
- VersionCount int
- TotalVersionCount int
- ModelName string
- ModelVersion string
- CkptName string
- ResultUrl string
-}
-
-type VersionInfo struct {
- Version []struct {
- ID int `json:"id"`
- Value string `json:"value"`
- } `json:"version"`
-}
-
-type Flavor struct {
- Info []struct {
- Code string `json:"code"`
- Value string `json:"value"`
- } `json:"flavor"`
-}
-
-type Engine struct {
- Info []struct {
- ID int `json:"id"`
- Value string `json:"value"`
- } `json:"engine"`
-}
-
-type ResourcePool struct {
- Info []struct {
- ID string `json:"id"`
- Value string `json:"value"`
- } `json:"resource_pool"`
-}
-
-// type Parameter struct {
-// Label string `json:"label"`
-// Value string `json:"value"`
-// }
-
-// type Parameters struct {
-// Parameter []Parameter `json:"parameter"`
-// }
-
-type Parameters struct {
- Parameter []struct {
- Label string `json:"label"`
- Value string `json:"value"`
- } `json:"parameter"`
+ ComputeResource string
+ DatasetName string
}
func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
createTime := timeutil.TimeStampNow()
- jobResult, err := createTrainJob(models.CreateTrainJobParams{
- JobName: req.JobName,
- Description: req.Description,
- Config: models.Config{
- WorkServerNum: req.WorkServerNumber,
- AppUrl: req.CodeObsPath,
- BootFileUrl: req.BootFileUrl,
- DataUrl: req.DataUrl,
- EngineID: req.EngineID,
- TrainUrl: req.TrainUrl,
- LogUrl: req.LogUrl,
- PoolID: req.PoolID,
- CreateVersion: true,
- Flavor: models.Flavor{
- Code: req.FlavorCode,
+ jobResult, err := createJob(models.CreateGrampusJobRequest{
+ Name: req.JobName,
+ Tasks: []models.GrampusTasks{
+ {
+ Name: req.JobName,
+ Command: req.Command,
+ ResourceSpecId: req.ResourceSpecId,
+ ImageId: req.ImageId,
+ ImageUrl: req.ImageUrl,
},
- Parameter: req.Parameters,
},
})
if err != nil {
- log.Error("CreateJob failed: %v", err.Error())
+ log.Error("createJob failed: %v", err.Error())
return err
}
- attach, err := models.GetAttachmentByUUID(req.Uuid)
- if err != nil {
- log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
- return err
- }
- jobId := strconv.FormatInt(jobResult.JobID, 10)
+ jobID := jobResult.JobInfo.JobID
err = models.CreateCloudbrain(&models.Cloudbrain{
- Status: TransTrainJobStatus(jobResult.Status),
- UserID: ctx.User.ID,
- RepoID: ctx.Repo.Repository.ID,
- JobID: jobId,
- JobName: req.JobName,
- DisplayJobName: req.DisplayJobName,
- JobType: string(models.JobTypeTrain),
- Type: models.TypeCloudBrainTwo,
- VersionID: jobResult.VersionID,
- VersionName: jobResult.VersionName,
- Uuid: req.Uuid,
- DatasetName: attach.Name,
- CommitID: req.CommitID,
- IsLatestVersion: req.IsLatestVersion,
- ComputeResource: models.NPUResource,
- EngineID: req.EngineID,
- TrainUrl: req.TrainUrl,
- BranchName: req.BranchName,
- Parameters: req.Params,
- BootFile: req.BootFile,
- DataUrl: req.DataUrl,
- LogUrl: req.LogUrl,
- FlavorCode: req.FlavorCode,
+ Status: string(models.GrampusStatusPending),
+ UserID: ctx.User.ID,
+ RepoID: ctx.Repo.Repository.ID,
+ JobID: jobID,
+ JobName: req.JobName,
+ DisplayJobName: req.DisplayJobName,
+ JobType: string(models.JobTypeTrain),
+ Type: models.TypeCloudBrainGrampus,
+ //VersionID: jobResult.VersionID,
+ //VersionName: jobResult.VersionName,
+ Uuid: req.Uuid,
+ DatasetName: req.DatasetName,
+ CommitID: req.CommitID,
+ //IsLatestVersion: req.IsLatestVersion,
+ ComputeResource: req.ComputeResource,
+ //EngineID: req.EngineID,
+ TrainUrl: req.TrainUrl,
+ BranchName: req.BranchName,
+ //Parameters: req.Params,
+ BootFile: req.BootFile,
+ DataUrl: req.DataUrl,
+ //LogUrl: req.LogUrl,
+ //FlavorCode: req.FlavorCode,
Description: req.Description,
WorkServerNumber: req.WorkServerNumber,
FlavorName: req.FlavorName,
@@ -237,58 +137,14 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error())
return err
}
- notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask)
- return nil
-}
-
-func TransTrainJobStatus(status int) string {
- switch status {
- case 0:
- return "UNKNOWN"
- case 1:
- return "INIT"
- case 2:
- return "IMAGE_CREATING"
- case 3:
- return "IMAGE_FAILED"
- case 4:
- return "SUBMIT_TRYING"
- case 5:
- return "SUBMIT_FAILED"
- case 6:
- return "DELETE_FAILED"
- case 7:
- return "WAITING"
- case 8:
- return "RUNNING"
- case 9:
- return "KILLING"
- case 10:
- return "COMPLETED"
- case 11:
- return "FAILED"
- case 12:
- return "KILLED"
- case 13:
- return "CANCELED"
- case 14:
- return "LOST"
- case 15:
- return "SCALING"
- case 16:
- return "SUBMIT_MODEL_FAILED"
- case 17:
- return "DEPLOY_SERVICE_FAILED"
- case 18:
- return "CHECK_INIT"
- case 19:
- return "CHECK_RUNNING"
- case 20:
- return "CHECK_RUNNING_COMPLETED"
- case 21:
- return "CHECK_FAILED"
- default:
- return strconv.Itoa(status)
+ var actionType models.ActionType
+ if req.ComputeResource == models.NPUResource {
+ actionType = models.ActionCreateTrainTask
+ } else if req.ComputeResource == models.GPUResource {
+ actionType = models.ActionCreateGPUTrainTask
}
+ notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, actionType)
+
+ return nil
}
diff --git a/modules/grampus/resty.go b/modules/grampus/resty.go
index 693ba71a1d..f64c98b554 100755
--- a/modules/grampus/resty.go
+++ b/modules/grampus/resty.go
@@ -23,19 +23,18 @@ const (
urlOpenApiV1 = "/openapi/v1/"
urlGetToken = urlOpenApiV1 + "token"
- urlNotebook = "/demanager/instances"
- urlTrainJob = "/training-jobs"
+ urlTrainJob = urlOpenApiV1 + "trainjob"
urlResourceSpecs = "/job/resource-specs"
urlTrainJobConfig = "/training-job-configs"
errorCodeExceedLimit = "ModelArts.0118"
urlNotebook2 = ""
- modelartsIllegalToken = ""
+ errorIllegalToken = 1005
)
type GetTokenParams struct {
- UserName string `json:"user_name"`
+ UserName string `json:"username"`
Password string `json:"password"`
}
@@ -92,44 +91,34 @@ func getToken() error {
return nil
}
-func CreateJob(createJobParams models.CreateNotebookParams) (*models.CreateNotebookResult, error) {
+func createJob(req models.CreateGrampusJobRequest) (*models.CreateGrampusJobResponse, error) {
checkSetting()
client := getRestyClient()
- var result models.CreateNotebookResult
+ var result models.CreateGrampusJobResponse
retry := 0
sendjob:
- res, err := client.R().
+ _, err := client.R().
SetHeader("Content-Type", "application/json").
SetAuthToken(TOKEN).
- SetBody(createJobParams).
+ SetBody(req).
SetResult(&result).
- Post(HOST + "/v1/" + setting.ProjectID + urlNotebook)
+ Post(HOST + urlTrainJob)
if err != nil {
- return nil, fmt.Errorf("resty create notebook: %s", err)
+ return nil, fmt.Errorf("resty CreateJob: %s", err)
}
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
+ if result.ErrorCode == errorIllegalToken && retry < 1 {
retry++
_ = getToken()
goto sendjob
}
- var response models.NotebookResult
- err = json.Unmarshal(res.Body(), &response)
- if err != nil {
- log.Error("json.Unmarshal failed: %s", err.Error())
- return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error())
- }
-
- if len(response.ErrorCode) != 0 {
- log.Error("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
- if response.ErrorCode == errorCodeExceedLimit {
- response.ErrorMsg = "所选规格使用数量已超过最大配额限制。"
- }
- return &result, fmt.Errorf("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+ if result.ErrorCode != 0 {
+ log.Error("CreateJob failed(%d): %s", result.ErrorCode, result.ErrorMsg)
+ return &result, fmt.Errorf("CreateJob failed(%d): %s", result.ErrorCode, result.ErrorMsg)
}
return &result, nil
@@ -147,7 +136,7 @@ sendjob:
SetHeader("Content-Type", "application/json").
SetAuthToken(TOKEN).
SetResult(&result).
- Get(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID)
+ Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID)
if err != nil {
return nil, fmt.Errorf("resty GetJob: %v", err)
@@ -174,217 +163,6 @@ sendjob:
return &result, nil
}
-func GetNotebook2(jobID string) (*models.GetNotebook2Result, error) {
- checkSetting()
- client := getRestyClient()
- var result models.GetNotebook2Result
-
- retry := 0
-
-sendjob:
- res, err := client.R().
- SetHeader("Content-Type", "application/json").
- SetAuthToken(TOKEN).
- SetResult(&result).
- Get(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID)
-
- if err != nil {
- return nil, fmt.Errorf("resty GetJob: %v", err)
- }
-
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
-
- var response models.NotebookResult
- err = json.Unmarshal(res.Body(), &response)
- if err != nil {
- log.Error("json.Unmarshal failed: %s", err.Error())
- return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error())
- }
-
- if len(response.ErrorCode) != 0 {
- log.Error("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
- if response.ErrorCode == modelartsIllegalToken && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
- return &result, fmt.Errorf("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
- }
-
- return &result, nil
-}
-
-func ManageNotebook(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) {
- checkSetting()
- client := getRestyClient()
- var result models.NotebookActionResult
-
- retry := 0
-
-sendjob:
- res, err := client.R().
- SetHeader("Content-Type", "application/json").
- SetBody(param).
- SetAuthToken(TOKEN).
- SetResult(&result).
- Post(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID + "/action")
-
- if err != nil {
- return &result, fmt.Errorf("resty StopJob: %v", err)
- }
-
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
-
- var response models.NotebookResult
- err = json.Unmarshal(res.Body(), &response)
- if err != nil {
- log.Error("json.Unmarshal failed: %s", err.Error())
- return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error())
- }
-
- if len(response.ErrorCode) != 0 {
- log.Error("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
- return &result, fmt.Errorf("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
- }
-
- return &result, nil
-}
-
-func ManageNotebook2(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) {
- checkSetting()
- client := getRestyClient()
- var result models.NotebookActionResult
-
- retry := 0
-
-sendjob:
- res, err := client.R().
- SetHeader("Content-Type", "application/json").
- SetAuthToken(TOKEN).
- SetResult(&result).
- Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID + "/" + param.Action + "?duration=" + strconv.Itoa(autoStopDurationMs))
-
- if err != nil {
- return &result, fmt.Errorf("resty ManageNotebook2: %v", err)
- }
-
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
-
- var response models.NotebookResult
- err = json.Unmarshal(res.Body(), &response)
- if err != nil {
- log.Error("json.Unmarshal failed: %s", err.Error())
- return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error())
- }
-
- if len(response.ErrorCode) != 0 {
- log.Error("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
- if response.ErrorCode == modelartsIllegalToken && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
- return &result, fmt.Errorf("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
- }
-
- return &result, nil
-}
-
-func DelNotebook(jobID string) (*models.NotebookDelResult, error) {
- checkSetting()
- client := getRestyClient()
- var result models.NotebookDelResult
-
- retry := 0
-
-sendjob:
- res, err := client.R().
- SetHeader("Content-Type", "application/json").
- SetAuthToken(TOKEN).
- SetResult(&result).
- Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID)
-
- if err != nil {
- return &result, fmt.Errorf("resty DelJob: %v", err)
- }
-
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
-
- var response models.NotebookResult
- err = json.Unmarshal(res.Body(), &response)
- if err != nil {
- log.Error("json.Unmarshal failed: %s", err.Error())
- return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error())
- }
-
- if len(response.ErrorCode) != 0 {
- log.Error("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
- return &result, fmt.Errorf("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
- }
-
- return &result, nil
-}
-
-func DelNotebook2(jobID string) (*models.NotebookDelResult, error) {
- checkSetting()
- client := getRestyClient()
- var result models.NotebookDelResult
-
- retry := 0
-
-sendjob:
- res, err := client.R().
- SetHeader("Content-Type", "application/json").
- SetAuthToken(TOKEN).
- SetResult(&result).
- Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID)
-
- if err != nil {
- return &result, fmt.Errorf("resty DelJob: %v", err)
- }
-
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
-
- var response models.NotebookResult
- err = json.Unmarshal(res.Body(), &response)
- if err != nil {
- log.Error("json.Unmarshal failed: %s", err.Error())
- return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error())
- }
-
- if len(response.ErrorCode) != 0 {
- log.Error("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
- if response.ErrorCode == modelartsIllegalToken && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
- return &result, fmt.Errorf("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
- }
-
- return &result, nil
-}
-
func DelJob(jobID string) (*models.NotebookDelResult, error) {
checkSetting()
client := getRestyClient()
@@ -397,7 +175,7 @@ sendjob:
SetHeader("Content-Type", "application/json").
SetAuthToken(TOKEN).
SetResult(&result).
- Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID)
+ Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID)
if err != nil {
return &result, fmt.Errorf("resty DelJob: %v", err)
@@ -424,45 +202,6 @@ sendjob:
return &result, nil
}
-func GetJobToken(jobID string) (*models.NotebookGetJobTokenResult, error) {
- checkSetting()
- client := getRestyClient()
- var result models.NotebookGetJobTokenResult
-
- retry := 0
-
-sendjob:
- res, err := client.R().
- SetHeader("Content-Type", "application/json").
- SetAuthToken(TOKEN).
- SetResult(&result).
- Get(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID + "/token")
-
- if err != nil {
- return &result, fmt.Errorf("resty GetJobToken: %v", err)
- }
-
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
-
- var response models.NotebookResult
- err = json.Unmarshal(res.Body(), &response)
- if err != nil {
- log.Error("json.Unmarshal failed: %s", err.Error())
- return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error())
- }
-
- if len(response.ErrorCode) != 0 {
- log.Error("GetJobToken failed(%s): %s", response.ErrorCode, response.ErrorMsg)
- return &result, fmt.Errorf("GetJobToken failed(%s): %s", response.ErrorCode, response.ErrorMsg)
- }
-
- return &result, nil
-}
-
func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.CreateTrainJobResult, error) {
checkSetting()
client := getRestyClient()
@@ -519,61 +258,6 @@ sendjob:
return &result, nil
}
-func createTrainJobVersion(createJobVersionParams models.CreateTrainJobVersionParams, jobID string) (*models.CreateTrainJobResult, error) {
- checkSetting()
- client := getRestyClient()
- var result models.CreateTrainJobResult
-
- retry := 0
-
-sendjob:
- res, err := client.R().
- SetHeader("Content-Type", "application/json").
- SetAuthToken(TOKEN).
- SetBody(createJobVersionParams).
- SetResult(&result).
- Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions")
-
- if err != nil {
- return nil, fmt.Errorf("resty create train-job version: %s", err)
- }
-
- req, _ := json.Marshal(createJobVersionParams)
- log.Info("%s", req)
-
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
-
- if res.StatusCode() != http.StatusOK {
- var temp models.ErrorResult
- if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
- log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- }
- BootFileErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.BootFileUrl + "'."
- DataSetErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.DataUrl + "'."
- if temp.ErrorMsg == BootFileErrorMsg {
- log.Error("启动文件错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- return &result, fmt.Errorf("启动文件错误!")
- }
- if temp.ErrorMsg == DataSetErrorMsg {
- log.Error("数据集错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- return &result, fmt.Errorf("数据集错误!")
- }
- return &result, fmt.Errorf("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- }
-
- if !result.IsSuccess {
- log.Error("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg)
- return &result, fmt.Errorf("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg)
- }
-
- return &result, nil
-}
-
func GetResourceSpecs() (*models.GetResourceSpecsResult, error) {
checkSetting()
client := getRestyClient()
@@ -616,145 +300,6 @@ sendjob:
return &result, nil
}
-func CreateTrainJobConfig(req models.CreateConfigParams) (*models.CreateTrainJobConfigResult, error) {
- checkSetting()
- client := getRestyClient()
- var result models.CreateTrainJobConfigResult
-
- retry := 0
-
-sendjob:
- res, err := client.R().
- SetHeader("Content-Type", "application/json").
- SetAuthToken(TOKEN).
- SetBody(req).
- SetResult(&result).
- Post(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig)
-
- if err != nil {
- return nil, fmt.Errorf("resty CreateTrainJobConfig: %s", err)
- }
-
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
-
- //temp, _ := json.Marshal(req)
- //log.Info("%s", temp)
-
- if res.StatusCode() != http.StatusOK {
- var temp models.ErrorResult
- if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
- log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- }
- log.Error("CreateTrainJobConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- return &result, fmt.Errorf("CreateTrainJobConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- }
-
- if !result.IsSuccess {
- log.Error("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg)
- return &result, fmt.Errorf("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg)
- }
-
- return &result, nil
-}
-
-func GetConfigList(perPage, page int, sortBy, order, searchContent, configType string) (*models.GetConfigListResult, error) {
- checkSetting()
- client := getRestyClient()
- var result models.GetConfigListResult
-
- retry := 0
-
-sendjob:
- res, err := client.R().
- SetQueryParams(map[string]string{
- "per_page": strconv.Itoa(perPage),
- "page": strconv.Itoa(page),
- "sortBy": sortBy,
- "order": order,
- "search_content": searchContent,
- "config_type": configType,
- }).
- SetAuthToken(TOKEN).
- SetResult(&result).
- Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig)
-
- if err != nil {
- return nil, fmt.Errorf("resty GetConfigList: %v", err)
- }
-
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
-
- if res.StatusCode() != http.StatusOK {
- var temp models.ErrorResult
- if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
- log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- }
- log.Error("GetConfigList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- return &result, fmt.Errorf("获取参数配置列表失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- }
-
- if !result.IsSuccess {
- log.Error("GetConfigList failed(%s): %s", result.ErrorCode, result.ErrorMsg)
- return &result, fmt.Errorf("获取参数配置列表失败(%s): %s", result.ErrorCode, result.ErrorMsg)
- }
-
- return &result, nil
-}
-
-func GetParaConfig(configName, configType string) (models.GetConfigResult, error) {
- checkSetting()
- client := getRestyClient()
- var result models.GetConfigResult
-
- retry := 0
-
-sendjob:
- res, err := client.R().
- SetQueryParams(map[string]string{
- "config_type": configType,
- }).
- SetAuthToken(TOKEN).
- SetResult(&result).
- Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig + "/" + configName)
-
- if err != nil {
- return result, fmt.Errorf("resty GetParaConfig: %v", err)
- }
-
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
-
- if res.StatusCode() != http.StatusOK {
- var temp models.ErrorResult
- if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
- log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- return result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- }
- log.Error("GetParaConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- return result, fmt.Errorf("获取参数配置详情失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- }
-
- if !result.IsSuccess {
- log.Error("GetParaConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg)
- return result, fmt.Errorf("获取参数配置详情失败(%s): %s", result.ErrorCode, result.ErrorMsg)
- }
-
- return result, nil
-}
-
func GetTrainJob(jobID, versionID string) (*models.GetTrainJobResult, error) {
checkSetting()
client := getRestyClient()
@@ -1062,51 +607,3 @@ sendjob:
return &result, nil
}
-
-func createNotebook2(createJobParams models.CreateNotebook2Params) (*models.CreateNotebookResult, error) {
- checkSetting()
- client := getRestyClient()
- var result models.CreateNotebookResult
-
- retry := 0
-
-sendjob:
- res, err := client.R().
- SetHeader("Content-Type", "application/json").
- SetAuthToken(TOKEN).
- SetBody(createJobParams).
- SetResult(&result).
- Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2)
-
- if err != nil {
- return nil, fmt.Errorf("resty create notebook2: %s", err)
- }
-
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
-
- var response models.NotebookResult
- err = json.Unmarshal(res.Body(), &response)
- if err != nil {
- log.Error("json.Unmarshal failed: %s", err.Error())
- return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error())
- }
-
- if len(response.ErrorCode) != 0 {
- log.Error("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
- if response.ErrorCode == errorCodeExceedLimit {
- response.ErrorMsg = "所选规格使用数量已超过最大配额限制。"
- }
- if response.ErrorCode == modelartsIllegalToken && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
- return &result, fmt.Errorf("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
- }
-
- return &result, nil
-}
diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go
index 8f31896616..c8c54a9287 100755
--- a/routers/repo/grampus.go
+++ b/routers/repo/grampus.go
@@ -3,9 +3,11 @@ package repo
import (
"code.gitea.io/gitea/modules/auth"
"code.gitea.io/gitea/modules/git"
+ "code.gitea.io/gitea/modules/grampus"
"code.gitea.io/gitea/modules/modelarts"
"code.gitea.io/gitea/modules/util"
"encoding/json"
+ "errors"
"io/ioutil"
"net/http"
"os"
@@ -149,7 +151,7 @@ func grampusGpuNewDataPrepare(ctx *context.Context) error {
}
func GrampusTrainJobNPUNew(ctx *context.Context) {
- err := trainJobNpuNewDataPrepare(ctx)
+ err := grampusTrainJobNpuNewDataPrepare(ctx)
if err != nil {
ctx.ServerError("get new train-job info failed", err)
return
@@ -157,7 +159,7 @@ func GrampusTrainJobNPUNew(ctx *context.Context) {
ctx.HTML(200, tplGrampusTrainJobNPUNew)
}
-func trainJobNpuNewDataPrepare(ctx *context.Context) error {
+func grampusTrainJobNpuNewDataPrepare(ctx *context.Context) error {
ctx.Data["PageIsCloudBrain"] = true
t := time.Now()
@@ -215,110 +217,122 @@ func trainJobNpuNewDataPrepare(ctx *context.Context) error {
return nil
}
-func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
+func grampusParamCheckCreateTrainJob(form auth.CreateGrampusTrainJobForm) error {
+ if !strings.HasSuffix(form.BootFile, ".py") {
+ log.Error("the boot file(%s) must be a python file", form.BootFile)
+ return errors.New("启动文件必须是python文件")
+ }
+
+ if form.BranchName == "" {
+ log.Error("the branch must not be null!", form.BranchName)
+ return errors.New("代码分支不能为空!")
+ }
+
+ return nil
+}
+
+func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
displayJobName := form.DisplayJobName
jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
+ //todo:del
+ jobName = displayJobName
uuid := form.Attachment
description := form.Description
- workServerNumber := form.WorkServerNumber
- engineID := form.EngineID
bootFile := form.BootFile
- flavorCode := form.Flavor
params := form.Params
- poolID := form.PoolID
- isSaveParam := form.IsSaveParam
repo := ctx.Repo.Repository
codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
- outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/"
- logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
- branch_name := form.BranchName
+ branchName := form.BranchName
isLatestVersion := modelarts.IsLatestVersion
FlavorName := form.FlavorName
VersionCount := modelarts.VersionCount
EngineName := form.EngineName
- count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID)
+ log.Info(jobName)
+
+ count, err := models.GetGrampusCountByUserID(ctx.User.ID, string(models.JobTypeTrain), models.NPUResource)
if err != nil {
- log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form)
+ log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
+ grampusTrainJobNpuNewDataPrepare(ctx)
+ ctx.RenderWithErr("system error", tplGrampusTrainJobNPUNew, &form)
return
} else {
if count >= 1 {
log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobNew, &form)
+ grampusTrainJobNpuNewDataPrepare(ctx)
+ ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplGrampusTrainJobNPUNew, &form)
return
}
}
- if err := paramCheckCreateTrainJob(form); err != nil {
+ if err := grampusParamCheckCreateTrainJob(form); err != nil {
log.Error("paramCheckCreateTrainJob failed:(%v)", err)
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
+ grampusTrainJobNpuNewDataPrepare(ctx)
+ ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form)
return
}
- //Determine whether the task name of the task in the project is duplicated
+ //check whether the task name in the project is duplicated
tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName)
if err == nil {
if len(tasks) != 0 {
log.Error("the job name did already exist", ctx.Data["MsgID"])
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("the job name did already exist", tplModelArtsTrainJobNew, &form)
+ grampusTrainJobNpuNewDataPrepare(ctx)
+ ctx.RenderWithErr("the job name did already exist", tplGrampusTrainJobNPUNew, &form)
return
}
} else {
if !models.IsErrJobNotExist(err) {
log.Error("system error, %v", err, ctx.Data["MsgID"])
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form)
+ grampusTrainJobNpuNewDataPrepare(ctx)
+ ctx.RenderWithErr("system error", tplGrampusTrainJobNPUNew, &form)
return
}
}
- //todo: del the codeLocalPath
+ //prepare code and out path
_, err = ioutil.ReadDir(codeLocalPath)
if err == nil {
os.RemoveAll(codeLocalPath)
}
gitRepo, _ := git.OpenRepository(repo.RepoPath())
- commitID, _ := gitRepo.GetBranchCommitID(branch_name)
+ commitID, _ := gitRepo.GetBranchCommitID(branchName)
- if err := downloadCode(repo, codeLocalPath, branch_name); err != nil {
+ if err := downloadCode(repo, codeLocalPath, branchName); err != nil {
log.Error("downloadCode failed, server timed out: %s (%v)", repo.FullName(), err)
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("Create task failed, server timed out", tplModelArtsTrainJobNew, &form)
+ grampusTrainJobNpuNewDataPrepare(ctx)
+ ctx.RenderWithErr("Create task failed, server timed out", tplGrampusTrainJobNPUNew, &form)
return
}
//todo: upload code (send to file_server todo this work?)
if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form)
+ grampusTrainJobNpuNewDataPrepare(ctx)
+ ctx.RenderWithErr("Failed to obsMkdir_output", tplGrampusTrainJobNPUNew, &form)
return
}
if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form)
+ grampusTrainJobNpuNewDataPrepare(ctx)
+ ctx.RenderWithErr("Failed to obsMkdir_log", tplGrampusTrainJobNPUNew, &form)
return
}
- // parentDir := VersionOutputPath + "/"
if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
// if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil {
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form)
+ grampusTrainJobNpuNewDataPrepare(ctx)
+ ctx.RenderWithErr("Failed to uploadCodeToObs", tplGrampusTrainJobNPUNew, &form)
return
}
+ //prepare command
+ //todo: download code, download dataset, unzip dataset, exec code, upload model
var parameters models.Parameters
param := make([]models.Parameter, 0)
existDeviceTarget := false
@@ -326,8 +340,8 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateModelArtsTra
err := json.Unmarshal([]byte(params), ¶meters)
if err != nil {
log.Error("Failed to Unmarshal params: %s (%v)", params, err)
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobNew, &form)
+ grampusTrainJobNpuNewDataPrepare(ctx)
+ ctx.RenderWithErr("运行参数错误", tplGrampusTrainJobNPUNew, &form)
return
}
@@ -350,67 +364,32 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateModelArtsTra
})
}
- //save param config
- if isSaveParam == "on" {
- saveparams := append(param, models.Parameter{
- Label: modelarts.TrainUrl,
- Value: outputObsPath,
- }, models.Parameter{
- Label: modelarts.DataUrl,
- Value: dataPath,
- })
- if form.ParameterTemplateName == "" {
- log.Error("ParameterTemplateName is empty")
- trainJobNewDataPrepare(ctx)
- ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form)
- return
- }
-
- _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
- ConfigName: form.ParameterTemplateName,
- Description: form.PrameterDescription,
- DataUrl: dataPath,
- AppUrl: codeObsPath,
- BootFileUrl: codeObsPath + bootFile,
- TrainUrl: outputObsPath,
- Flavor: models.Flavor{
- Code: flavorCode,
- },
- WorkServerNum: workServerNumber,
- EngineID: int64(engineID),
- LogUrl: logObsPath,
- PoolID: poolID,
- Parameter: saveparams,
- })
-
- if err != nil {
- log.Error("Failed to CreateTrainJobConfig: %v", err)
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form)
- return
- }
- }
-
- req := &modelarts.GenerateTrainJobReq{
- JobName: jobName,
- DisplayJobName: displayJobName,
- DataUrl: dataPath,
- Description: description,
- CodeObsPath: codeObsPath,
- BootFileUrl: codeObsPath + bootFile,
- BootFile: bootFile,
- TrainUrl: outputObsPath,
- FlavorCode: flavorCode,
- WorkServerNumber: workServerNumber,
- EngineID: int64(engineID),
- LogUrl: logObsPath,
- PoolID: poolID,
- Uuid: uuid,
- Parameters: param,
- CommitID: commitID,
- IsLatestVersion: isLatestVersion,
- BranchName: branch_name,
- Params: form.Params,
+ req := &grampus.GenerateTrainJobReq{
+ JobName: jobName,
+ DisplayJobName: displayJobName,
+ ComputeResource: models.NPUResource,
+ Command: "echo \"test\"",
+ ResourceSpecId: "modelarts.kat1.xlarge",
+ ImageUrl: "",
+ ImageId: "tensorflow_1.15-cann_5.0.3-py_3.7-euler_2.8.3-aarch64",
+
+ DataUrl: dataPath,
+ Description: description,
+ CodeObsPath: codeObsPath,
+ BootFileUrl: codeObsPath + bootFile,
+ BootFile: bootFile,
+ //TrainUrl: outputObsPath,
+ //FlavorCode: flavorCode,
+ WorkServerNumber: 1,
+ //EngineID: int64(engineID),
+ //LogUrl: logObsPath,
+ //PoolID: poolID,
+ Uuid: uuid,
+ //Parameters: param,
+ CommitID: commitID,
+ IsLatestVersion: isLatestVersion,
+ BranchName: branchName,
+ //Params: form.Params,
FlavorName: FlavorName,
EngineName: EngineName,
VersionCount: VersionCount,
@@ -424,11 +403,11 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateModelArtsTra
return
}
- err = modelarts.GenerateTrainJob(ctx, req)
+ err = grampus.GenerateTrainJob(ctx, req)
if err != nil {
log.Error("GenerateTrainJob failed:%v", err.Error())
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
+ grampusTrainJobNpuNewDataPrepare(ctx)
+ ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form)
return
}
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go
index 3b4e8738fc..dfb2631a01 100755
--- a/routers/repo/modelarts.go
+++ b/routers/repo/modelarts.go
@@ -559,24 +559,11 @@ func TrainJobIndex(ctx *context.Context) {
}
listType := ctx.Query("listType")
- if len(listType) == 0 {
- listType = models.AllResource
- }
ctx.Data["ListType"] = listType
- typeCloudBrain := models.TypeCloudBrainAll
- if listType == models.GPUResource {
- typeCloudBrain = models.TypeCloudBrainOne
- } else if listType == models.NPUResource {
- typeCloudBrain = models.TypeCloudBrainTwo
- } else if listType == models.AllResource {
- typeCloudBrain = models.TypeCloudBrainAll
+ if listType == models.AllResource {
+ listType = ""
}
- //else {
- // log.Error("listType(%s) error", listType)
- // ctx.ServerError("listType error", errors.New("listType error"))
- // return
- //}
var jobTypes []string
jobTypes = append(jobTypes, string(models.JobTypeTrain))
@@ -586,10 +573,10 @@ func TrainJobIndex(ctx *context.Context) {
PageSize: setting.UI.IssuePagingNum,
},
RepoID: repo.ID,
- Type: typeCloudBrain,
JobTypeNot: false,
JobTypes: jobTypes,
IsLatestVersion: modelarts.IsLatestVersion,
+ ComputeResource: listType,
})
if err != nil {
ctx.ServerError("Cloudbrain", err)
diff --git a/routers/routes/routes.go b/routers/routes/routes.go
index ab3e2dd559..6348e4ba8b 100755
--- a/routers/routes/routes.go
+++ b/routers/routes/routes.go
@@ -1103,7 +1103,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel)
})
m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusTrainJobNPUNew)
- //m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.GrampusTrainJobCreate)
+ m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateGrampusTrainJobForm{}), repo.GrampusTrainJobNpuCreate)
})
})
}, context.RepoRef())
--
2.34.1
From ad72d9510337e40c9f10fbe4887c234d998da2f8 Mon Sep 17 00:00:00 2001
From: lewis <747342561@qq.com>
Date: Mon, 23 May 2022 20:21:44 +0800
Subject: [PATCH 04/56] get job
---
models/cloudbrain.go | 19 ++++++++++++-------
modules/grampus/grampus.go | 19 ++++++++++++++-----
modules/grampus/resty.go | 24 +++++++++---------------
routers/api/v1/repo/modelarts.go | 26 +++++++++++++++++++++++++-
routers/repo/cloudbrain.go | 25 +++++++++++++++++++++++++
routers/repo/modelarts.go | 1 +
6 files changed, 86 insertions(+), 28 deletions(-)
diff --git a/models/cloudbrain.go b/models/cloudbrain.go
index f775626ada..694e277d41 100755
--- a/models/cloudbrain.go
+++ b/models/cloudbrain.go
@@ -102,11 +102,11 @@ const (
//grampus
GrampusStatusPending = "pending"
- GrampusStatusRunning = "running"
- GrampusStatusFailed = "failed"
- GrampusStatusSucceeded = "succeeded"
- GrampusStatusStopped = "stopped"
- GrampusStatusUnknown = "unknown"
+ GrampusStatusRunning = "RUNNING"
+ GrampusStatusFailed = "FAILED"
+ GrampusStatusSucceeded = "SUCCEEDED"
+ GrampusStatusStopped = "STOPPED"
+ GrampusStatusUnknown = "UNKNOWN"
)
type Cloudbrain struct {
@@ -214,7 +214,7 @@ func ConvertDurationToStr(duration int64) string {
}
func IsTrainJobTerminal(status string) bool {
- return status == string(ModelArtsTrainJobCompleted) || status == string(ModelArtsTrainJobFailed) || status == string(ModelArtsTrainJobKilled)
+ return status == string(ModelArtsTrainJobCompleted) || status == string(ModelArtsTrainJobFailed) || status == string(ModelArtsTrainJobKilled) || status == GrampusStatusFailed || status == GrampusStatusStopped || status == GrampusStatusSucceeded
}
func IsModelArtsDebugJobTerminal(status string) bool {
@@ -1185,6 +1185,11 @@ type CreateGrampusJobResponse struct {
JobInfo GrampusJobInfo `json:"otJob"`
}
+type GetGrampusJobResponse struct {
+ GrampusResult
+ JobInfo GrampusJobInfo `json:"otJob"`
+}
+
type GrampusTasks struct {
Command string `json:"command"`
Name string `json:"name"`
@@ -1227,7 +1232,7 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
)
}
- if len(opts.ComputeResource) >= 0 {
+ if len(opts.ComputeResource) > 0 {
cond = cond.And(
builder.Eq{"cloudbrain.compute_resource": opts.ComputeResource},
)
diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go
index 26e143429f..71e368fa6f 100755
--- a/modules/grampus/grampus.go
+++ b/modules/grampus/grampus.go
@@ -6,6 +6,7 @@ import (
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/notification"
"code.gitea.io/gitea/modules/timeutil"
+ "strings"
)
const (
@@ -100,7 +101,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
jobID := jobResult.JobInfo.JobID
err = models.CreateCloudbrain(&models.Cloudbrain{
- Status: string(models.GrampusStatusPending),
+ Status: TransTrainJobStatus(jobResult.JobInfo.Status),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: jobID,
@@ -110,10 +111,10 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
Type: models.TypeCloudBrainGrampus,
//VersionID: jobResult.VersionID,
//VersionName: jobResult.VersionName,
- Uuid: req.Uuid,
- DatasetName: req.DatasetName,
- CommitID: req.CommitID,
- //IsLatestVersion: req.IsLatestVersion,
+ Uuid: req.Uuid,
+ DatasetName: req.DatasetName,
+ CommitID: req.CommitID,
+ IsLatestVersion: req.IsLatestVersion,
ComputeResource: req.ComputeResource,
//EngineID: req.EngineID,
TrainUrl: req.TrainUrl,
@@ -148,3 +149,11 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
return nil
}
+
+func TransTrainJobStatus(status string) string {
+ if status == "pending" {
+ status = "waiting"
+ }
+
+ return strings.ToUpper(status)
+}
diff --git a/modules/grampus/resty.go b/modules/grampus/resty.go
index f64c98b554..6f1ee72d67 100755
--- a/modules/grampus/resty.go
+++ b/modules/grampus/resty.go
@@ -124,40 +124,34 @@ sendjob:
return &result, nil
}
-func GetJob(jobID string) (*models.GetNotebookResult, error) {
+func GetJob(jobID string) (*models.GetGrampusJobResponse, error) {
checkSetting()
client := getRestyClient()
- var result models.GetNotebookResult
+ var result models.GetGrampusJobResponse
retry := 0
sendjob:
- res, err := client.R().
+ _, err := client.R().
SetHeader("Content-Type", "application/json").
SetAuthToken(TOKEN).
SetResult(&result).
- Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID)
+ Get(HOST + urlTrainJob + "/" + jobID)
if err != nil {
return nil, fmt.Errorf("resty GetJob: %v", err)
}
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
+ if result.ErrorCode == errorIllegalToken && retry < 1 {
retry++
+ log.Info("retry get token")
_ = getToken()
goto sendjob
}
- var response models.NotebookResult
- err = json.Unmarshal(res.Body(), &response)
- if err != nil {
- log.Error("json.Unmarshal failed: %s", err.Error())
- return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error())
- }
-
- if len(response.ErrorCode) != 0 {
- log.Error("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
- return &result, fmt.Errorf("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
+ if result.ErrorCode != 0 {
+ log.Error("GetJob failed(%d): %s", result.ErrorCode, result.ErrorMsg)
+ return &result, fmt.Errorf("GetJob failed(%d): %s", result.ErrorCode, result.ErrorMsg)
}
return &result, nil
diff --git a/routers/api/v1/repo/modelarts.go b/routers/api/v1/repo/modelarts.go
index 9e4edea032..06e4bea44d 100755
--- a/routers/api/v1/repo/modelarts.go
+++ b/routers/api/v1/repo/modelarts.go
@@ -6,6 +6,7 @@
package repo
import (
+ "code.gitea.io/gitea/modules/grampus"
"net/http"
"strconv"
"strings"
@@ -167,7 +168,7 @@ func GetModelArtsTrainJobVersion(ctx *context.APIContext) {
log.Error("UpdateJob failed:", err)
}
}
- } else {
+ } else if job.Type == models.TypeCloudBrainTwo {
result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(job.VersionID, 10))
if err != nil {
ctx.NotFound(err)
@@ -181,6 +182,29 @@ func GetModelArtsTrainJobVersion(ctx *context.APIContext) {
job.Duration = result.Duration / 1000
job.TrainJobDuration = models.ConvertDurationToStr(job.Duration)
+ if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 {
+ job.EndTime = job.StartTime.Add(job.Duration)
+ }
+ job.CorrectCreateUnix()
+ err = models.UpdateTrainJobVersion(job)
+ if err != nil {
+ log.Error("UpdateJob failed:", err)
+ }
+ } else if job.Type == models.TypeCloudBrainGrampus {
+ result, err := grampus.GetJob(jobID)
+ if err != nil {
+ log.Error("GetJob(%s) failed:%v", job.JobName, err)
+ ctx.NotFound(err)
+ return
+ }
+
+ if job.StartTime == 0 && result.JobInfo.StartedAt > 0 {
+ job.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt / 1000)
+ }
+ job.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
+ job.Duration = result.JobInfo.RunSec
+ job.TrainJobDuration = models.ConvertDurationToStr(job.Duration)
+
if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 {
job.EndTime = job.StartTime.Add(job.Duration)
}
diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go
index df27a12c26..b95fe2fe55 100755
--- a/routers/repo/cloudbrain.go
+++ b/routers/repo/cloudbrain.go
@@ -2,6 +2,7 @@ package repo
import (
"bufio"
+ "code.gitea.io/gitea/modules/grampus"
"encoding/json"
"errors"
"fmt"
@@ -1492,7 +1493,31 @@ func SyncCloudbrainStatus() {
} else {
log.Error("task.JobType(%s) is error:%s", task.JobName, task.JobType)
}
+ } else if task.Type == models.TypeCloudBrainGrampus {
+ result, err := grampus.GetJob(task.JobID)
+ if err != nil {
+ log.Error("GetTrainJob(%s) failed:%v", task.JobName, err)
+ continue
+ }
+ if result != nil {
+ task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
+ task.Duration = result.JobInfo.RunSec
+ task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
+
+ if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
+ task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt / 1000)
+ }
+ if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
+ task.EndTime = task.StartTime.Add(task.Duration)
+ }
+ task.CorrectCreateUnix()
+ err = models.UpdateJob(task)
+ if err != nil {
+ log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
+ continue
+ }
+ }
} else {
log.Error("task.Type(%s) is error:%d", task.JobName, task.Type)
}
diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go
index dfb2631a01..ea4ff3b1d2 100755
--- a/routers/repo/modelarts.go
+++ b/routers/repo/modelarts.go
@@ -577,6 +577,7 @@ func TrainJobIndex(ctx *context.Context) {
JobTypes: jobTypes,
IsLatestVersion: modelarts.IsLatestVersion,
ComputeResource: listType,
+ Type: models.TypeCloudBrainAll,
})
if err != nil {
ctx.ServerError("Cloudbrain", err)
--
2.34.1
From 340f25c73df4aab6906a134272f7352be7843b66 Mon Sep 17 00:00:00 2001
From: lewis <747342561@qq.com>
Date: Tue, 24 May 2022 18:10:36 +0800
Subject: [PATCH 05/56] show job
---
models/cloudbrain.go | 10 +
modules/cloudbrain/cloudbrain.go | 1 -
modules/grampus/resty.go | 393 +---------
routers/api/v1/api.go | 9 +
routers/api/v1/repo/modelarts.go | 2 +-
routers/repo/cloudbrain.go | 2 +-
routers/repo/grampus.go | 183 ++++-
routers/repo/modelarts.go | 5 -
routers/routes/routes.go | 18 +-
templates/repo/grampus/trainjob/gpu/show.tmpl | 731 ------------------
.../repo/grampus/trainjob/{npu => }/show.tmpl | 69 +-
templates/repo/modelarts/trainjob/index.tmpl | 6 +-
12 files changed, 245 insertions(+), 1184 deletions(-)
delete mode 100755 templates/repo/grampus/trainjob/gpu/show.tmpl
rename templates/repo/grampus/trainjob/{npu => }/show.tmpl (90%)
diff --git a/models/cloudbrain.go b/models/cloudbrain.go
index 694e277d41..6a6645d6b4 100755
--- a/models/cloudbrain.go
+++ b/models/cloudbrain.go
@@ -1190,6 +1190,11 @@ type GetGrampusJobResponse struct {
JobInfo GrampusJobInfo `json:"otJob"`
}
+type GrampusStopJobResponse struct {
+ GrampusResult
+ StoppedAt int64 `json:"stoppedAt"`
+}
+
type GrampusTasks struct {
Command string `json:"command"`
Name string `json:"name"`
@@ -1487,6 +1492,11 @@ func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {
return getRepoCloudBrain(cb)
}
+func GetCloudbrainByJobIDWithDeleted(jobID string) (*Cloudbrain, error) {
+ cb := &Cloudbrain{JobID: jobID}
+ return getRepoCloudBrainWithDeleted(cb)
+}
+
func GetCloudbrainByID(id string) (*Cloudbrain, error) {
idInt64, _ := strconv.ParseInt(id, 10, 64)
cb := &Cloudbrain{ID: idInt64}
diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go
index dc1d0e4609..0e62b71d5c 100755
--- a/modules/cloudbrain/cloudbrain.go
+++ b/modules/cloudbrain/cloudbrain.go
@@ -52,7 +52,6 @@ func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, er
return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin()
} else {
- log.Info("is job creator:" + strconv.FormatBool(ctx.User.ID == job.UserID))
return ctx.IsUserRepoOwner() || ctx.IsUserSiteAdmin() || ctx.User.ID == job.UserID
}
diff --git a/modules/grampus/resty.go b/modules/grampus/resty.go
index 6f1ee72d67..0d87f390d0 100755
--- a/modules/grampus/resty.go
+++ b/modules/grampus/resty.go
@@ -1,16 +1,14 @@
package grampus
import (
- "crypto/tls"
- "encoding/json"
- "fmt"
- "net/http"
- "strconv"
-
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
+ "crypto/tls"
+ "encoding/json"
+ "fmt"
"github.com/go-resty/resty/v2"
+ "net/http"
)
var (
@@ -129,11 +127,11 @@ func GetJob(jobID string) (*models.GetGrampusJobResponse, error) {
client := getRestyClient()
var result models.GetGrampusJobResponse
+ log.Info(jobID, TOKEN)
retry := 0
sendjob:
_, err := client.R().
- SetHeader("Content-Type", "application/json").
SetAuthToken(TOKEN).
SetResult(&result).
Get(HOST + urlTrainJob + "/" + jobID)
@@ -157,101 +155,6 @@ sendjob:
return &result, nil
}
-func DelJob(jobID string) (*models.NotebookDelResult, error) {
- checkSetting()
- client := getRestyClient()
- var result models.NotebookDelResult
-
- retry := 0
-
-sendjob:
- res, err := client.R().
- SetHeader("Content-Type", "application/json").
- SetAuthToken(TOKEN).
- SetResult(&result).
- Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID)
-
- if err != nil {
- return &result, fmt.Errorf("resty DelJob: %v", err)
- }
-
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
-
- var response models.NotebookResult
- err = json.Unmarshal(res.Body(), &response)
- if err != nil {
- log.Error("json.Unmarshal failed: %s", err.Error())
- return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error())
- }
-
- if len(response.ErrorCode) != 0 {
- log.Error("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
- return &result, fmt.Errorf("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
- }
-
- return &result, nil
-}
-
-func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.CreateTrainJobResult, error) {
- checkSetting()
- client := getRestyClient()
- var result models.CreateTrainJobResult
-
- retry := 0
-
-sendjob:
- res, err := client.R().
- SetHeader("Content-Type", "application/json").
- SetAuthToken(TOKEN).
- SetBody(createJobParams).
- SetResult(&result).
- Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob)
-
- if err != nil {
- return nil, fmt.Errorf("resty create train-job: %s", err)
- }
-
- req, _ := json.Marshal(createJobParams)
- log.Info("%s", req)
-
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
-
- if res.StatusCode() != http.StatusOK {
- var temp models.ErrorResult
- if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
- log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- }
- log.Error("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- BootFileErrorMsg := "Invalid OBS path '" + createJobParams.Config.BootFileUrl + "'."
- DataSetErrorMsg := "Invalid OBS path '" + createJobParams.Config.DataUrl + "'."
- if temp.ErrorMsg == BootFileErrorMsg {
- log.Error("启动文件错误!createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- return &result, fmt.Errorf("启动文件错误!")
- }
- if temp.ErrorMsg == DataSetErrorMsg {
- log.Error("数据集错误!createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- return &result, fmt.Errorf("数据集错误!")
- }
- return &result, fmt.Errorf("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- }
-
- if !result.IsSuccess {
- log.Error("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg)
- return &result, fmt.Errorf("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg)
- }
-
- return &result, nil
-}
-
func GetResourceSpecs() (*models.GetResourceSpecsResult, error) {
checkSetting()
client := getRestyClient()
@@ -294,309 +197,63 @@ sendjob:
return &result, nil
}
-func GetTrainJob(jobID, versionID string) (*models.GetTrainJobResult, error) {
+func GetTrainJobLog(jobID string) (string, error) {
checkSetting()
client := getRestyClient()
- var result models.GetTrainJobResult
-
- retry := 0
+ var logContent string
-sendjob:
res, err := client.R().
SetAuthToken(TOKEN).
- SetResult(&result).
- Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID)
+ SetResult(&logContent).
+ Get(HOST + urlTrainJob + "/" + jobID + "/log")
if err != nil {
- return nil, fmt.Errorf("resty GetTrainJob: %v", err)
- }
-
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
+ return logContent, fmt.Errorf("resty GetTrainJobLog: %v", err)
}
if res.StatusCode() != http.StatusOK {
- var temp models.ErrorResult
+ var temp models.GrampusResult
if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- }
- log.Error("GetTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- return &result, fmt.Errorf("获取作业详情失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- }
-
- if !result.IsSuccess {
- log.Error("GetTrainJob(%s) failed", jobID)
- return &result, fmt.Errorf("获取作业详情失败")
- }
-
- return &result, nil
-}
-
-func GetTrainJobLog(jobID, versionID, baseLine, logFile, order string, lines int) (*models.GetTrainJobLogResult, error) {
- checkSetting()
- client := getRestyClient()
- var result models.GetTrainJobLogResult
-
- retry := 0
-
-sendjob:
- res, err := client.R().
- SetQueryParams(map[string]string{
- "base_line": baseLine,
- "lines": strconv.Itoa(lines),
- "log_file": logFile,
- "order": order,
- }).
- SetAuthToken(TOKEN).
- SetResult(&result).
- Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/aom-log")
-
- if err != nil {
- return nil, fmt.Errorf("resty GetTrainJobLog: %v", err)
- }
-
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
-
- if res.StatusCode() != http.StatusOK {
- var temp models.ErrorResult
- if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
- log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
+ return logContent, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
}
log.Error("GetTrainJobLog failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- return &result, fmt.Errorf("获取作业日志失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- }
-
- if !result.IsSuccess {
- log.Error("GetTrainJobLog(%s) failed", jobID)
- return &result, fmt.Errorf("获取作业日志失败:%s", result.ErrorMsg)
- }
-
- return &result, nil
-}
-
-func GetTrainJobLogFileNames(jobID, versionID string) (*models.GetTrainJobLogFileNamesResult, error) {
- checkSetting()
- client := getRestyClient()
- var result models.GetTrainJobLogFileNamesResult
-
- retry := 0
-
-sendjob:
- res, err := client.R().
- SetAuthToken(TOKEN).
- SetResult(&result).
- Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/log/file-names")
-
- if err != nil {
- return nil, fmt.Errorf("resty GetTrainJobLogFileNames: %v", err)
- }
-
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
-
- if res.StatusCode() != http.StatusOK {
- var temp models.ErrorResult
- if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
- log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- }
- log.Error("GetTrainJobLogFileNames failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- return &result, fmt.Errorf("GetTrainJobLogFileNames failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
+ return logContent, fmt.Errorf("GetTrainJobLog failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
}
- if !result.IsSuccess {
- log.Error("GetTrainJobLogFileNames(%s) failed", jobID)
- return &result, fmt.Errorf("获取作业日志文件失败:%s", result.ErrorMsg)
- }
+ logContent = res.String()
- return &result, nil
+ return logContent, nil
}
-func DelTrainJob(jobID string) (*models.TrainJobResult, error) {
+func StopJob(jobID string) (*models.GrampusStopJobResponse, error) {
checkSetting()
client := getRestyClient()
- var result models.TrainJobResult
+ var result models.GrampusStopJobResponse
retry := 0
sendjob:
- res, err := client.R().
- SetAuthToken(TOKEN).
- SetResult(&result).
- Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID)
-
- if err != nil {
- return &result, fmt.Errorf("resty DelTrainJob: %v", err)
- }
-
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
-
- if res.StatusCode() != http.StatusOK {
- var temp models.ErrorResult
- if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
- log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- }
- log.Error("DelTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- return &result, fmt.Errorf("删除训练作业失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- }
-
- if !result.IsSuccess {
- log.Error("DelTrainJob(%s) failed", jobID)
- return &result, fmt.Errorf("删除训练作业失败:%s", result.ErrorMsg)
- }
-
- return &result, nil
-}
-
-func StopTrainJob(jobID, versionID string) (*models.TrainJobResult, error) {
- checkSetting()
- client := getRestyClient()
- var result models.TrainJobResult
-
- retry := 0
-
-sendjob:
- res, err := client.R().
+ _, err := client.R().
+ //SetHeader("Content-Type", "application/json").
SetAuthToken(TOKEN).
SetResult(&result).
- Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/stop")
+ Post(HOST + urlTrainJob + "/" + jobID + "/stop")
if err != nil {
return &result, fmt.Errorf("resty StopTrainJob: %v", err)
}
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
-
- if res.StatusCode() != http.StatusOK {
- var temp models.ErrorResult
- if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
- log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- }
- log.Error("StopTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- return &result, fmt.Errorf("停止训练作业失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- }
-
- if !result.IsSuccess {
- log.Error("StopTrainJob(%s) failed", jobID)
- return &result, fmt.Errorf("停止训练作业失败:%s", result.ErrorMsg)
- }
-
- return &result, nil
-}
-
-func DelTrainJobVersion(jobID string, versionID string) (*models.TrainJobResult, error) {
- checkSetting()
- client := getRestyClient()
- var result models.TrainJobResult
-
- retry := 0
-
-sendjob:
- res, err := client.R().
- SetAuthToken(TOKEN).
- SetResult(&result).
- Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID)
-
- if err != nil {
- return &result, fmt.Errorf("resty DelTrainJobVersion: %v", err)
- }
-
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
- retry++
- _ = getToken()
- goto sendjob
- }
-
- if res.StatusCode() != http.StatusOK {
- var temp models.ErrorResult
- if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
- log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- }
- log.Error("DelTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- return &result, fmt.Errorf("删除训练作业版本失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- }
-
- if !result.IsSuccess {
- log.Error("DelTrainJob(%s) failed", jobID)
- return &result, fmt.Errorf("删除训练作业版本失败:%s", result.ErrorMsg)
- }
-
- return &result, nil
-}
-
-func createInferenceJob(createJobParams models.CreateInferenceJobParams) (*models.CreateTrainJobResult, error) {
- checkSetting()
- client := getRestyClient()
- var result models.CreateTrainJobResult
-
- retry := 0
-
-sendjob:
- res, err := client.R().
- SetHeader("Content-Type", "application/json").
- SetAuthToken(TOKEN).
- SetBody(createJobParams).
- SetResult(&result).
- Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob)
-
- if err != nil {
- return nil, fmt.Errorf("resty create inference-job: %s", err)
- }
-
- req, _ := json.Marshal(createJobParams)
- log.Info("%s", req)
-
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
+ if result.ErrorCode == errorIllegalToken && retry < 1 {
retry++
+ log.Info("retry get token")
_ = getToken()
goto sendjob
}
- if res.StatusCode() != http.StatusOK {
- var temp models.ErrorResult
- if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
- log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- }
- log.Error("createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- BootFileErrorMsg := "Invalid OBS path '" + createJobParams.InfConfig.BootFileUrl + "'."
- DataSetErrorMsg := "Invalid OBS path '" + createJobParams.InfConfig.DataUrl + "'."
- if temp.ErrorMsg == BootFileErrorMsg {
- log.Error("启动文件错误!createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- return &result, fmt.Errorf("启动文件错误!")
- }
- if temp.ErrorMsg == DataSetErrorMsg {
- log.Error("数据集错误!createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- return &result, fmt.Errorf("数据集错误!")
- }
- return &result, fmt.Errorf("createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- }
-
- if !result.IsSuccess {
- log.Error("createInferenceJob failed(%s): %s", result.ErrorCode, result.ErrorMsg)
- return &result, fmt.Errorf("createInferenceJob failed(%s): %s", result.ErrorCode, result.ErrorMsg)
+ if result.ErrorCode != 0 {
+ log.Error("GetJob failed(%d): %s", result.ErrorCode, result.ErrorMsg)
+ return &result, fmt.Errorf("GetJob failed(%d): %s", result.ErrorCode, result.ErrorMsg)
}
return &result, nil
diff --git a/routers/api/v1/api.go b/routers/api/v1/api.go
index 9a05aa8ae1..471f8be7ef 100755
--- a/routers/api/v1/api.go
+++ b/routers/api/v1/api.go
@@ -934,6 +934,15 @@ func RegisterRoutes(m *macaron.Macaron) {
})
})
}, reqRepoReader(models.UnitTypeCloudBrain))
+ m.Group("/grampus", func() {
+ m.Get("/:id", repo.GetCloudbrainTask)
+ m.Group("/train-job", func() {
+ m.Group("/:jobid", func() {
+ m.Post("/stop_version", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo_ext.GrampusStopJob)
+ m.Get("/log", repo_ext.GrampusGetLog)
+ })
+ })
+ }, reqRepoReader(models.UnitTypeCloudBrain))
}, repoAssignment())
})
diff --git a/routers/api/v1/repo/modelarts.go b/routers/api/v1/repo/modelarts.go
index 06e4bea44d..c6f4b8b264 100755
--- a/routers/api/v1/repo/modelarts.go
+++ b/routers/api/v1/repo/modelarts.go
@@ -199,7 +199,7 @@ func GetModelArtsTrainJobVersion(ctx *context.APIContext) {
}
if job.StartTime == 0 && result.JobInfo.StartedAt > 0 {
- job.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt / 1000)
+ job.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
}
job.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
job.Duration = result.JobInfo.RunSec
diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go
index 7edb8c3bb5..69cd249011 100755
--- a/routers/repo/cloudbrain.go
+++ b/routers/repo/cloudbrain.go
@@ -1482,7 +1482,7 @@ func SyncCloudbrainStatus() {
task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
- task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt / 1000)
+ task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
}
if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
task.EndTime = task.StartTime.Add(task.Duration)
diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go
index c8c54a9287..ab9d712905 100755
--- a/routers/repo/grampus.go
+++ b/routers/repo/grampus.go
@@ -5,6 +5,7 @@ import (
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/grampus"
"code.gitea.io/gitea/modules/modelarts"
+ "code.gitea.io/gitea/modules/timeutil"
"code.gitea.io/gitea/modules/util"
"encoding/json"
"errors"
@@ -25,13 +26,13 @@ import (
)
const (
+ tplGrampusTrainJobShow base.TplName = "repo/grampus/trainjob/show"
+
//GPU
- tplGrampusTrainJobGPUNew base.TplName = "repo/grampus/trainjob/gpu/new"
- tplGrampusTrainJobGPUShow base.TplName = "repo/grampus/trainjob/gpu/show"
+ tplGrampusTrainJobGPUNew base.TplName = "repo/grampus/trainjob/gpu/new"
//NPU
- tplGrampusTrainJobNPUNew base.TplName = "repo/grampus/trainjob/npu/new"
- tplGrampusTrainJobNPUShow base.TplName = "repo/grampus/trainjob/npu/show"
+ tplGrampusTrainJobNPUNew base.TplName = "repo/grampus/trainjob/npu/new"
)
func GrampusTrainJobGPUNew(ctx *context.Context) {
@@ -368,10 +369,10 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
JobName: jobName,
DisplayJobName: displayJobName,
ComputeResource: models.NPUResource,
- Command: "echo \"test\"",
- ResourceSpecId: "modelarts.kat1.xlarge",
+ Command: "echo test",
+ ResourceSpecId: "f2497d54732b45fb8d887e63be1db4a7",
ImageUrl: "",
- ImageId: "tensorflow_1.15-cann_5.0.3-py_3.7-euler_2.8.3-aarch64",
+ ImageId: "e6e85cd78ca24e158f71b6fac9c2fb95",
DataUrl: dataPath,
Description: description,
@@ -412,3 +413,171 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
}
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
}
+
+func GrampusStopJob(ctx *context.Context) {
+ var ID = ctx.Params(":jobid")
+ var resultCode = "0"
+ var errorMsg = ""
+ var status = ""
+
+ task := ctx.Cloudbrain
+ for {
+ if task.Status == string(models.GrampusStatusStopped) || task.Status == string(models.GrampusStatusFailed) || task.Status == string(models.GrampusStatusSucceeded) {
+ log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"])
+ resultCode = "-1"
+ errorMsg = "system error"
+ break
+ }
+
+ res, err := grampus.StopJob(task.JobID)
+ if err != nil {
+ log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
+ resultCode = strconv.Itoa(res.ErrorCode)
+ errorMsg = res.ErrorMsg
+ break
+ }
+
+ task.Status = string(models.GrampusStatusStopped)
+ if task.EndTime == 0 {
+ task.EndTime = timeutil.TimeStampNow()
+ }
+ task.ComputeAndSetDuration()
+ err = models.UpdateJob(task)
+ if err != nil {
+ log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
+ resultCode = "-1"
+ errorMsg = "system error"
+ break
+ }
+
+ status = task.Status
+ break
+ }
+
+ ctx.JSON(200, map[string]interface{}{
+ "result_code": resultCode,
+ "error_msg": errorMsg,
+ "status": status,
+ "id": ID,
+ "StatusOK": 0,
+ })
+}
+
+func GrampusTrainJobDel(ctx *context.Context) {
+ var listType = ctx.Query("listType")
+ if err := deleteGrampusJob(ctx); err != nil {
+ log.Error("deleteGrampusJob failed: %v", err, ctx.Data["msgID"])
+ ctx.ServerError(err.Error(), err)
+ return
+ }
+
+ var isAdminPage = ctx.Query("isadminpage")
+ var isHomePage = ctx.Query("ishomepage")
+ if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
+ ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
+ } else if isHomePage == "true" {
+ ctx.Redirect(setting.AppSubURL + "/cloudbrains")
+ } else {
+ ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType)
+ }
+}
+
+func deleteGrampusJob(ctx *context.Context) error {
+ task := ctx.Cloudbrain
+
+ if task.Status != string(models.GrampusStatusStopped) && task.Status != string(models.GrampusStatusSucceeded) && task.Status != string(models.GrampusStatusFailed) {
+ log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"])
+ return errors.New("the job has not been stopped")
+ }
+
+ err := models.DeleteJob(task)
+ if err != nil {
+ log.Error("DeleteJob failed: %v", err, ctx.Data["msgID"])
+ return err
+ }
+
+ storageType := models.TypeCloudBrainOne
+ if task.ComputeResource == models.NPUResource {
+ storageType = models.TypeCloudBrainTwo
+ }
+ deleteJobStorage(task.JobName, storageType)
+
+ return nil
+}
+
+func GrampusTrainJobShow(ctx *context.Context) {
+ ctx.Data["PageIsCloudBrain"] = true
+ //debugListType := ctx.Query("debugListType")
+
+ var task *models.Cloudbrain
+ task, err := models.GetCloudbrainByJobIDWithDeleted(ctx.Params(":jobid"))
+ if err != nil {
+ log.Error("GetCloudbrainByJobID failed:" + err.Error())
+ ctx.ServerError("system error", err)
+ return
+ }
+
+ attachment, err := models.GetAttachmentByUUID(task.Uuid)
+ if err == nil {
+ task.DatasetName = attachment.Name
+ }
+
+ taskList := make([]*models.Cloudbrain, 0)
+ taskList = append(taskList, task)
+ ctx.Data["version_list_task"] = taskList
+
+ if task.DeletedAt.IsZero() { //normal record
+ result, err := grampus.GetJob(task.JobID)
+ if err != nil {
+ log.Error("GetJob failed:" + err.Error())
+ ctx.ServerError("GetJob failed", err)
+ return
+ }
+
+ if result != nil {
+ task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
+ if task.Status != result.JobInfo.Status || result.JobInfo.Status == models.GrampusStatusRunning {
+ task.Duration = result.JobInfo.RunSec
+ task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
+
+ if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
+ task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
+ }
+ if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
+ task.EndTime = task.StartTime.Add(task.Duration)
+ }
+ task.CorrectCreateUnix()
+ err = models.UpdateJob(task)
+ if err != nil {
+ log.Error("UpdateJob failed:" + err.Error())
+ }
+ }
+ }
+ }
+
+ ctx.HTML(http.StatusOK, tplGrampusTrainJobShow)
+}
+
+func GrampusGetLog(ctx *context.Context) {
+ jobID := ctx.Params(":jobid")
+ job, err := models.GetCloudbrainByJobID(jobID)
+ if err != nil {
+ log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"])
+ ctx.ServerError(err.Error(), err)
+ return
+ }
+
+ content, err := grampus.GetTrainJobLog(job.JobID)
+ if err != nil {
+ log.Error("GetJobLog failed: %v", err, ctx.Data["MsgID"])
+ ctx.ServerError(err.Error(), err)
+ return
+ }
+
+ ctx.JSON(http.StatusOK, map[string]interface{}{
+ "JobName": job.JobName,
+ "Content": content,
+ })
+
+ return
+}
diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go
index ea4ff3b1d2..6f0d2f3732 100755
--- a/routers/repo/modelarts.go
+++ b/routers/repo/modelarts.go
@@ -587,11 +587,6 @@ func TrainJobIndex(ctx *context.Context) {
for i, task := range tasks {
tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
- if task.Cloudbrain.Type == models.TypeCloudBrainOne {
- tasks[i].ComputeResource = models.GPUResource
- } else if task.Cloudbrain.Type == models.TypeCloudBrainTwo {
- tasks[i].ComputeResource = models.NPUResource
- }
}
pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
diff --git a/routers/routes/routes.go b/routers/routes/routes.go
index 6348e4ba8b..64566a1d7c 100755
--- a/routers/routes/routes.go
+++ b/routers/routes/routes.go
@@ -1085,23 +1085,17 @@ func RegisterRoutes(m *macaron.Macaron) {
}, context.RepoRef())
m.Group("/grampus", func() {
m.Group("/train-job", func() {
+ m.Group("/:jobid", func() {
+ m.Get("", reqRepoCloudBrainReader, repo.GrampusTrainJobShow)
+ m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.GrampusStopJob)
+ m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.GrampusTrainJobDel)
+ m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel)
+ })
m.Group("/gpu", func() {
- m.Group("/:jobid", func() {
- m.Get("", reqRepoCloudBrainReader, repo.CloudBrainTrainJobShow)
- m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainStop)
- m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainTrainJobDel)
- m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel)
- })
m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusTrainJobGPUNew)
//m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.GrampusTrainJobCreate)
})
m.Group("/npu", func() {
- m.Group("/:jobid", func() {
- m.Get("", reqRepoCloudBrainReader, repo.CloudBrainTrainJobShow)
- m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.CloudBrainStop)
- m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainTrainJobDel)
- m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel)
- })
m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusTrainJobNPUNew)
m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateGrampusTrainJobForm{}), repo.GrampusTrainJobNpuCreate)
})
diff --git a/templates/repo/grampus/trainjob/gpu/show.tmpl b/templates/repo/grampus/trainjob/gpu/show.tmpl
deleted file mode 100755
index f1087abcfb..0000000000
--- a/templates/repo/grampus/trainjob/gpu/show.tmpl
+++ /dev/null
@@ -1,731 +0,0 @@
-{{template "base/head" .}}
-
-
-
- {{template "repo/header" .}}
-
-
- {{range $k ,$v := .version_list_task}}
-
-
-
-
-
-
-
-
-
- {{TimeSinceUnix1 .CreatedUnix}}
-
- {{$.i18n.Tr "repo.modelarts.status"}}:
- {{.Status}}
-
- {{$.i18n.Tr "repo.modelarts.train_job.dura_time"}}:
- {{$.duration}}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
{{.VersionName}}
-
/
-
-
-
-
-
-
-
-
-
-
- {{end}} {{template "base/paginate" .}}
-
-
-
-
-
-
-
-
你确认删除该任务么?此任务一旦删除不可恢复。
-
-
-
-
-
-
-
-{{template "base/footer" .}}
-
-
\ No newline at end of file
diff --git a/templates/repo/grampus/trainjob/npu/show.tmpl b/templates/repo/grampus/trainjob/show.tmpl
similarity index 90%
rename from templates/repo/grampus/trainjob/npu/show.tmpl
rename to templates/repo/grampus/trainjob/show.tmpl
index 8f168fcf9f..e228b2ed96 100755
--- a/templates/repo/grampus/trainjob/npu/show.tmpl
+++ b/templates/repo/grampus/trainjob/show.tmpl
@@ -232,47 +232,15 @@
{{$.CsrfTokenHtml}}
- {{if and (.CanModify) (eq .Status "COMPLETED") ($.Permission.CanWrite $.UnitTypeModelManage) }}
-
- {{else}}
-
- {{end}}
-
- {{if .CanModify}}
-
- {{else}}
-
- {{end}}
-
- {{if .CanDel}}
-
- {{else}}
-
- {{end}}
-
-
- {{if .CanDel}}
-
- {{else}}
-
- {{end}}
+
- {{if not (eq .Cloudbrain.StartTime 0)}}
- {{TimeSinceUnix1 .Cloudbrain.StartTime}}
+ {{if not (eq .StartTime 0)}}
+ {{TimeSinceUnix1 .StartTime}}
{{else}}
- {{TimeSinceUnix1 .Cloudbrain.CreatedUnix}}
+ {{TimeSinceUnix1 .CreatedUnix}}
{{end}}
{{$.i18n.Tr "repo.modelarts.current_version"}}:{{.VersionName}}
@@ -355,10 +323,10 @@
- {{if not (eq .Cloudbrain.StartTime 0)}}
- {{TimeSinceUnix1 .Cloudbrain.StartTime}}
+ {{if not (eq .StartTime 0)}}
+ {{TimeSinceUnix1 .StartTime}}
{{else}}
- {{TimeSinceUnix1 .Cloudbrain.CreatedUnix}}
+ {{TimeSinceUnix1 .CreatedUnix}}
{{end}}
@@ -464,8 +432,8 @@
- {{.Cloudbrain.Description}}
+ title="{{.Description}}">
+ {{.Description}}
@@ -479,15 +447,6 @@
-
-
-
-
-
-
@@ -701,7 +660,7 @@
return size + unitArr[index];
}
function refreshStatus(version_name) {
- $.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}?version_name=${version_name}`, (data) => {
+ $.get(`/api/v1/repos/${userName}/${repoPath}/grampus/train-job/${jobID}?version_name=${version_name}`, (data) => {
// header status and duration
$(`#${version_name}-duration-span`).text(data.JobDuration)
$(`#${version_name}-status-span span`).text(data.JobStatus)
@@ -758,7 +717,7 @@
});
}
function loadLog(version_name) {
- $.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/log?version_name=${version_name}&lines=50&order=asc`, (data) => {
+ $.get(`/api/v1/repos/${userName}/${repoPath}/grampus/train-job/${jobID}/log?version_name=${version_name}&lines=50&order=asc`, (data) => {
$('input[name=end_line]').val(data.EndLine)
$('input[name=start_line]').val(data.StartLine)
$(`#log_file${version_name}`).text(data.Content)
@@ -959,7 +918,7 @@
let logContentDom = document.querySelector(`#log${version_name}`)
$(`#log_file${version_name}`).siblings('pre').remove()
- $.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/log?version_name=${version_name}&base_line=&lines=50&order=asc`, (data) => {
+ $.get(`/api/v1/repos/${userName}/${repoPath}/grampus/train-job/${jobID}/log?version_name=${version_name}&base_line=&lines=50&order=asc`, (data) => {
$(`#log${version_name} input[name=end_line]`).val(data.EndLine) //如果变动就改变所对应的值
$(`#log${version_name} input[name=start_line]`).val(data.StartLine)
@@ -977,12 +936,12 @@
let version_name = $(this).data('version')
let logContentDom = document.querySelector(`#log${version_name}`)
$(`#log_file${version_name}`).siblings('pre').remove()
- $.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/log?version_name=${version_name}&base_line=&lines=50&order=desc`, (data) => {
+ $.get(`/api/v1/repos/${userName}/${repoPath}/grampus/train-job/${jobID}/log?version_name=${version_name}&base_line=&lines=50&order=desc`, (data) => {
$(`#log${version_name} input[name=end_line]`).val(data.EndLine) //如果变动就改变所对应的值
$(`#log${version_name} input[name=start_line]`).val(data.StartLine)
$(`#log${version_name}`).append('
' + data.Content)
- $.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/log?version_name=${version_name}&base_line=${data.EndLine}&lines=50&order=desc`, (data) => {
+ $.get(`/api/v1/repos/${userName}/${repoPath}/grampus/train-job/${jobID}/log?version_name=${version_name}&base_line=${data.EndLine}&lines=50&order=desc`, (data) => {
if (data.Lines == 0) {
$(`.message${version_name} #header`).text('您已翻阅至日志底部')
$(`.message${version_name}`).css('display', 'block')
diff --git a/templates/repo/modelarts/trainjob/index.tmpl b/templates/repo/modelarts/trainjob/index.tmpl
index edb146d7ec..37a547c474 100755
--- a/templates/repo/modelarts/trainjob/index.tmpl
+++ b/templates/repo/modelarts/trainjob/index.tmpl
@@ -112,7 +112,7 @@
-
+
{{.DisplayJobName}}
@@ -153,7 +153,7 @@
-
+
{{$.CsrfTokenHtml}}
{{if .CanDel}}
--
2.34.1
From 6f642da6c323b70808360ff9d97a51d02d118a48 Mon Sep 17 00:00:00 2001
From: lewis <747342561@qq.com>
Date: Tue, 24 May 2022 19:51:40 +0800
Subject: [PATCH 06/56] opt
---
modules/grampus/grampus.go | 7 ++++---
routers/repo/grampus.go | 38 ++++++++++++++++++++++++++------------
2 files changed, 30 insertions(+), 15 deletions(-)
diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go
index 71e368fa6f..5f580189d7 100755
--- a/modules/grampus/grampus.go
+++ b/modules/grampus/grampus.go
@@ -78,6 +78,7 @@ type GenerateTrainJobReq struct {
TotalVersionCount int
ComputeResource string
DatasetName string
+ Params string
}
func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
@@ -119,9 +120,9 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
//EngineID: req.EngineID,
TrainUrl: req.TrainUrl,
BranchName: req.BranchName,
- //Parameters: req.Params,
- BootFile: req.BootFile,
- DataUrl: req.DataUrl,
+ Parameters: req.Params,
+ BootFile: req.BootFile,
+ DataUrl: req.DataUrl,
//LogUrl: req.LogUrl,
//FlavorCode: req.FlavorCode,
Description: req.Description,
diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go
index ab9d712905..bf07cb0797 100755
--- a/routers/repo/grampus.go
+++ b/routers/repo/grampus.go
@@ -236,8 +236,6 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
displayJobName := form.DisplayJobName
jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
- //todo:del
- jobName = displayJobName
uuid := form.Attachment
description := form.Description
bootFile := form.BootFile
@@ -252,8 +250,6 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
VersionCount := modelarts.VersionCount
EngineName := form.EngineName
- log.Info(jobName)
-
count, err := models.GetGrampusCountByUserID(ctx.User.ID, string(models.JobTypeTrain), models.NPUResource)
if err != nil {
log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
@@ -387,10 +383,10 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
//PoolID: poolID,
Uuid: uuid,
//Parameters: param,
- CommitID: commitID,
- IsLatestVersion: isLatestVersion,
- BranchName: branchName,
- //Params: form.Params,
+ CommitID: commitID,
+ IsLatestVersion: isLatestVersion,
+ BranchName: branchName,
+ Params: form.Params,
FlavorName: FlavorName,
EngineName: EngineName,
VersionCount: VersionCount,
@@ -507,7 +503,6 @@ func deleteGrampusJob(ctx *context.Context) error {
func GrampusTrainJobShow(ctx *context.Context) {
ctx.Data["PageIsCloudBrain"] = true
- //debugListType := ctx.Query("debugListType")
var task *models.Cloudbrain
task, err := models.GetCloudbrainByJobIDWithDeleted(ctx.Params(":jobid"))
@@ -522,9 +517,24 @@ func GrampusTrainJobShow(ctx *context.Context) {
task.DatasetName = attachment.Name
}
- taskList := make([]*models.Cloudbrain, 0)
- taskList = append(taskList, task)
- ctx.Data["version_list_task"] = taskList
+ if len(task.Parameters) > 0 {
+ var parameters models.Parameters
+ err := json.Unmarshal([]byte(task.Parameters), &parameters)
+ if err != nil {
+ log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
+ ctx.ServerError("system error", err)
+ return
+ }
+
+ if len(parameters.Parameter) > 0 {
+ paramTemp := ""
+ for _, Parameter := range parameters.Parameter {
+ param := Parameter.Label + " = " + Parameter.Value + "; "
+ paramTemp = paramTemp + param
+ }
+ task.Parameters = paramTemp[:len(paramTemp)-2]
+ }
+ }
if task.DeletedAt.IsZero() { //normal record
result, err := grampus.GetJob(task.JobID)
@@ -555,6 +565,10 @@ func GrampusTrainJobShow(ctx *context.Context) {
}
}
+ taskList := make([]*models.Cloudbrain, 0)
+ taskList = append(taskList, task)
+ ctx.Data["version_list_task"] = taskList
+
ctx.HTML(http.StatusOK, tplGrampusTrainJobShow)
}
--
2.34.1
From 10e261c2e8a760a3e849952c0be4061da269c000 Mon Sep 17 00:00:00 2001
From: lewis <747342561@qq.com>
Date: Wed, 25 May 2022 18:34:37 +0800
Subject: [PATCH 07/56] create
---
models/cloudbrain.go | 28 +++
modules/auth/grampus.go | 22 +-
modules/grampus/grampus.go | 48 ++---
modules/grampus/resty.go | 59 ++++--
routers/repo/grampus.go | 210 ++++++++-----------
templates/repo/grampus/trainjob/npu/new.tmpl | 31 +--
6 files changed, 199 insertions(+), 199 deletions(-)
diff --git a/models/cloudbrain.go b/models/cloudbrain.go
index 6a6645d6b4..86af80235a 100755
--- a/models/cloudbrain.go
+++ b/models/cloudbrain.go
@@ -146,6 +146,7 @@ type Cloudbrain struct {
PreVersionName string //父版本名称
ComputeResource string //计算资源,例如npu
EngineID int64 //引擎id
+ ImageID string //grampus image_id
TrainUrl string //输出模型的obs路径
BranchName string //分支名称
@@ -1180,6 +1181,33 @@ type GrampusJobInfo struct {
Tasks []GrampusTasks `json:"tasks"`
}
+type GrampusSpec struct {
+ CreatedAt int64 `json:"createdAt"`
+ UpdatedAt int64 `json:"updatedAt"`
+ ID string `json:"id"`
+ Name string `json:"name"`
+ ProcessorType string `json:"processorType"`
+}
+
+type GetGrampusResourceSpecsResult struct {
+ GrampusResult
+ Infos []GrampusSpec `json:"resourceSpecs"`
+}
+
+type GrampusImage struct {
+ CreatedAt int64 `json:"createdAt"`
+ UpdatedAt int64 `json:"updatedAt"`
+ ID string `json:"id"`
+ Name string `json:"name"`
+ ProcessorType string `json:"processorType"`
+}
+
+type GetGrampusImagesResult struct {
+ GrampusResult
+ TotalSize int `json:"totalSize"`
+ Infos []GrampusImage `json:"images"`
+}
+
type CreateGrampusJobResponse struct {
GrampusResult
JobInfo GrampusJobInfo `json:"otJob"`
diff --git a/modules/auth/grampus.go b/modules/auth/grampus.go
index 2cfaf70061..b92d8d06d7 100755
--- a/modules/auth/grampus.go
+++ b/modules/auth/grampus.go
@@ -6,16 +6,18 @@ import (
)
type CreateGrampusTrainJobForm struct {
- DisplayJobName string `form:"display_job_name" binding:"Required"`
- JobName string `form:"job_name" binding:"Required"`
- Attachment string `form:"attachment" binding:"Required"`
- BootFile string `form:"boot_file" binding:"Required"`
- Flavor string `form:"flavor" binding:"Required"`
- Params string `form:"run_para_list" binding:"Required"`
- Description string `form:"description"`
- BranchName string `form:"branch_name" binding:"Required"`
- FlavorName string `form:"flaver_names" binding:"Required"`
- EngineName string `form:"engine_names" binding:"Required"`
+ DisplayJobName string `form:"display_job_name" binding:"Required"`
+ JobName string `form:"job_name" binding:"Required"`
+ Attachment string `form:"attachment" binding:"Required"`
+ BootFile string `form:"boot_file" binding:"Required"`
+ ImageID string `form:"image_id" binding:"Required"`
+ FlavorID string `form:"flavor" binding:"Required"`
+ Params string `form:"run_para_list" binding:"Required"`
+ Description string `form:"description"`
+ BranchName string `form:"branch_name" binding:"Required"`
+ FlavorName string `form:"flaver_names" binding:"Required"`
+ EngineName string `form:"engine_names" binding:"Required"`
+ WorkServerNumber int `form:"work_server_number" binding:"Required"`
}
func (f *CreateGrampusTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors {
diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go
index 5f580189d7..13280cac33 100755
--- a/modules/grampus/grampus.go
+++ b/modules/grampus/grampus.go
@@ -42,6 +42,9 @@ const (
SortByCreateTime = "create_time"
ConfigTypeCustom = "custom"
TotalVersionCount = 1
+
+ ProcessorTypeNPU = "npu.huawei.com/NPU"
+ ProcessorTypeGPU = "nvidia.com/gpu"
)
var (
@@ -54,7 +57,7 @@ type GenerateTrainJobReq struct {
JobName string
Command string
ResourceSpecId string
- ImageUrl string
+ ImageUrl string //与image_id二选一,都有的情况下优先image_url
ImageId string
DisplayJobName string
@@ -102,29 +105,26 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
jobID := jobResult.JobInfo.JobID
err = models.CreateCloudbrain(&models.Cloudbrain{
- Status: TransTrainJobStatus(jobResult.JobInfo.Status),
- UserID: ctx.User.ID,
- RepoID: ctx.Repo.Repository.ID,
- JobID: jobID,
- JobName: req.JobName,
- DisplayJobName: req.DisplayJobName,
- JobType: string(models.JobTypeTrain),
- Type: models.TypeCloudBrainGrampus,
- //VersionID: jobResult.VersionID,
- //VersionName: jobResult.VersionName,
- Uuid: req.Uuid,
- DatasetName: req.DatasetName,
- CommitID: req.CommitID,
- IsLatestVersion: req.IsLatestVersion,
- ComputeResource: req.ComputeResource,
- //EngineID: req.EngineID,
- TrainUrl: req.TrainUrl,
- BranchName: req.BranchName,
- Parameters: req.Params,
- BootFile: req.BootFile,
- DataUrl: req.DataUrl,
- //LogUrl: req.LogUrl,
- //FlavorCode: req.FlavorCode,
+ Status: TransTrainJobStatus(jobResult.JobInfo.Status),
+ UserID: ctx.User.ID,
+ RepoID: ctx.Repo.Repository.ID,
+ JobID: jobID,
+ JobName: req.JobName,
+ DisplayJobName: req.DisplayJobName,
+ JobType: string(models.JobTypeTrain),
+ Type: models.TypeCloudBrainGrampus,
+ Uuid: req.Uuid,
+ DatasetName: req.DatasetName,
+ CommitID: req.CommitID,
+ IsLatestVersion: req.IsLatestVersion,
+ ComputeResource: req.ComputeResource,
+ ImageID: req.ImageId,
+ TrainUrl: req.TrainUrl,
+ BranchName: req.BranchName,
+ Parameters: req.Params,
+ BootFile: req.BootFile,
+ DataUrl: req.DataUrl,
+ FlavorCode: req.ResourceSpecId,
Description: req.Description,
WorkServerNumber: req.WorkServerNumber,
FlavorName: req.FlavorName,
diff --git a/modules/grampus/resty.go b/modules/grampus/resty.go
index 0d87f390d0..183afb853b 100755
--- a/modules/grampus/resty.go
+++ b/modules/grampus/resty.go
@@ -22,7 +22,8 @@ const (
urlGetToken = urlOpenApiV1 + "token"
urlTrainJob = urlOpenApiV1 + "trainjob"
- urlResourceSpecs = "/job/resource-specs"
+ urlGetResourceSpecs = urlOpenApiV1 + "resourcespec"
+ urlGetImages = urlOpenApiV1 + "image"
urlTrainJobConfig = "/training-job-configs"
errorCodeExceedLimit = "ModelArts.0118"
@@ -155,43 +156,65 @@ sendjob:
return &result, nil
}
-func GetResourceSpecs() (*models.GetResourceSpecsResult, error) {
+func GetResourceSpecs(processorType string) (*models.GetGrampusResourceSpecsResult, error) {
checkSetting()
client := getRestyClient()
- var result models.GetResourceSpecsResult
+ var result models.GetGrampusResourceSpecsResult
retry := 0
sendjob:
- res, err := client.R().
- SetHeader("Content-Type", "application/json").
+ _, err := client.R().
SetAuthToken(TOKEN).
SetResult(&result).
- Get(HOST + "/v1/" + setting.ProjectID + urlResourceSpecs)
+ Get(HOST + urlGetResourceSpecs + "?processorType=" + processorType)
if err != nil {
return nil, fmt.Errorf("resty GetResourceSpecs: %v", err)
}
- if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
+ if result.ErrorCode == errorIllegalToken && retry < 1 {
retry++
+ log.Info("retry get token")
_ = getToken()
goto sendjob
}
- if res.StatusCode() != http.StatusOK {
- var temp models.ErrorResult
- if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
- log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
- }
- log.Error("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
- return &result, fmt.Errorf("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
+ if result.ErrorCode != 0 {
+ log.Error("GetResourceSpecs failed(%d): %s", result.ErrorCode, result.ErrorMsg)
+ return &result, fmt.Errorf("GetResourceSpecs failed(%d): %s", result.ErrorCode, result.ErrorMsg)
+ }
+
+ return &result, nil
+}
+
+func GetImages(processorType string) (*models.GetGrampusImagesResult, error) {
+ checkSetting()
+ client := getRestyClient()
+ var result models.GetGrampusImagesResult
+
+ retry := 0
+
+sendjob:
+ _, err := client.R().
+ SetAuthToken(TOKEN).
+ SetResult(&result).
+ Get(HOST + urlGetImages + "?processorType=" + processorType)
+
+ if err != nil {
+ return nil, fmt.Errorf("resty GetImages: %v", err)
}
- if !result.IsSuccess {
- log.Error("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg)
- return &result, fmt.Errorf("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg)
+ if result.ErrorCode == errorIllegalToken && retry < 1 {
+ retry++
+ log.Info("retry get token")
+ _ = getToken()
+ goto sendjob
+ }
+
+ if result.ErrorCode != 0 {
+ log.Error("GetImages failed(%d): %s", result.ErrorCode, result.ErrorMsg)
+ return &result, fmt.Errorf("GetImages failed(%d): %s", result.ErrorCode, result.ErrorMsg)
}
return &result, nil
diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go
index bf07cb0797..3f4b1361cd 100755
--- a/routers/repo/grampus.go
+++ b/routers/repo/grampus.go
@@ -135,18 +135,13 @@ func grampusGpuNewDataPrepare(ctx *context.Context) error {
json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs)
}
ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec
- ctx.Data["params"] = ""
- ctx.Data["branchName"] = ctx.Repo.BranchName
-
- ctx.Data["snn4imagenet_path"] = cloudbrain.Snn4imagenetMountPath
- ctx.Data["is_snn4imagenet_enabled"] = setting.IsSnn4imagenetEnabled
-
- ctx.Data["brainscore_path"] = cloudbrain.BrainScoreMountPath
- ctx.Data["is_brainscore_enabled"] = setting.IsBrainScoreEnabled
- ctx.Data["cloudbraintype"] = models.TypeCloudBrainOne
-
- ctx.Data["benchmarkMode"] = ctx.Query("benchmarkMode")
+ branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
+ if err != nil {
+ log.Error("GetBranches error:", err)
+ }
+ ctx.Data["branches"] = branches
+ ctx.Data["branchName"] = ctx.Repo.BranchName
return nil
}
@@ -170,51 +165,37 @@ func grampusTrainJobNpuNewDataPrepare(ctx *context.Context) error {
//get valid dataset
attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
if err != nil {
- ctx.ServerError("GetAllUserAttachments failed:", err)
- return err
- }
- ctx.Data["attachments"] = attachs
-
- //get valid resource specs
- var resourcePools modelarts.ResourcePool
- if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
+ log.Error("GetModelArtsTrainAttachments failed:", err.Error())
+ } else {
+ ctx.Data["attachments"] = attachs
}
- ctx.Data["resource_pools"] = resourcePools.Info
- var engines modelarts.Engine
- if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
+ //get valid engines
+ images, err := grampus.GetImages(grampus.ProcessorTypeNPU)
+ if err != nil {
+ log.Error("GetResourceSpecs failed:", err.Error())
+ } else {
+ ctx.Data["engine_versions"] = images.Infos
}
- ctx.Data["engines"] = engines.Info
- var versionInfos modelarts.VersionInfo
- if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
+ //get valid resource specs
+ specs, err := grampus.GetResourceSpecs(grampus.ProcessorTypeNPU)
+ if err != nil {
+ log.Error("GetResourceSpecs failed:", err.Error())
+ } else {
+ ctx.Data["flavor_infos"] = specs.Infos
}
- ctx.Data["engine_versions"] = versionInfos.Version
- var flavorInfos modelarts.Flavor
- if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
+ //get branches
+ branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
+ if err != nil {
+ log.Error("GetBranches error:", err.Error())
+ } else {
+ ctx.Data["branches"] = branches
}
- ctx.Data["flavor_infos"] = flavorInfos.Info
- ctx.Data["params"] = ""
ctx.Data["branchName"] = ctx.Repo.BranchName
- configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
- if err != nil {
- ctx.ServerError("getConfigList failed:", err)
- return err
- }
- ctx.Data["config_list"] = configList.ParaConfigs
- ctx.Data["cloudbraintype"] = models.TypeCloudBrainTwo
-
return nil
}
@@ -246,10 +227,11 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
branchName := form.BranchName
isLatestVersion := modelarts.IsLatestVersion
- FlavorName := form.FlavorName
- VersionCount := modelarts.VersionCount
- EngineName := form.EngineName
+ flavorName := form.FlavorName
+ versionCount := modelarts.VersionCount
+ engineName := form.EngineName
+ //check count limit
count, err := models.GetGrampusCountByUserID(ctx.User.ID, string(models.JobTypeTrain), models.NPUResource)
if err != nil {
log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
@@ -265,12 +247,14 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
}
}
+ //check param
if err := grampusParamCheckCreateTrainJob(form); err != nil {
log.Error("paramCheckCreateTrainJob failed:(%v)", err)
grampusTrainJobNpuNewDataPrepare(ctx)
ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form)
return
}
+
//check whether the task name in the project is duplicated
tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName)
if err == nil {
@@ -295,9 +279,6 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
os.RemoveAll(codeLocalPath)
}
- gitRepo, _ := git.OpenRepository(repo.RepoPath())
- commitID, _ := gitRepo.GetBranchCommitID(branchName)
-
if err := downloadCode(repo, codeLocalPath, branchName); err != nil {
log.Error("downloadCode failed, server timed out: %s (%v)", repo.FullName(), err)
grampusTrainJobNpuNewDataPrepare(ctx)
@@ -321,7 +302,6 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
}
if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
- // if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil {
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
grampusTrainJobNpuNewDataPrepare(ctx)
ctx.RenderWithErr("Failed to uploadCodeToObs", tplGrampusTrainJobNPUNew, &form)
@@ -330,9 +310,9 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
//prepare command
//todo: download code, download dataset, unzip dataset, exec code, upload model
+ command, err := generateCommand(grampus.ProcessorTypeNPU, codeObsPath, dataPath, params, "")
var parameters models.Parameters
param := make([]models.Parameter, 0)
- existDeviceTarget := false
if len(params) != 0 {
err := json.Unmarshal([]byte(params), ¶meters)
if err != nil {
@@ -343,63 +323,45 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
}
for _, parameter := range parameters.Parameter {
- if parameter.Label == modelarts.DeviceTarget {
- existDeviceTarget = true
- }
- if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
- param = append(param, models.Parameter{
- Label: parameter.Label,
- Value: parameter.Value,
- })
- }
+ param = append(param, models.Parameter{
+ Label: parameter.Label,
+ Value: parameter.Value,
+ })
}
}
- if !existDeviceTarget {
- param = append(param, models.Parameter{
- Label: modelarts.DeviceTarget,
- Value: modelarts.Ascend,
- })
- }
+ param = append(param, models.Parameter{
+ Label: modelarts.DeviceTarget,
+ Value: modelarts.Ascend,
+ })
+
+ gitRepo, _ := git.OpenRepository(repo.RepoPath())
+ commitID, _ := gitRepo.GetBranchCommitID(branchName)
req := &grampus.GenerateTrainJobReq{
- JobName: jobName,
- DisplayJobName: displayJobName,
- ComputeResource: models.NPUResource,
- Command: "echo test",
- ResourceSpecId: "f2497d54732b45fb8d887e63be1db4a7",
- ImageUrl: "",
- ImageId: "e6e85cd78ca24e158f71b6fac9c2fb95",
-
- DataUrl: dataPath,
- Description: description,
- CodeObsPath: codeObsPath,
- BootFileUrl: codeObsPath + bootFile,
- BootFile: bootFile,
- //TrainUrl: outputObsPath,
- //FlavorCode: flavorCode,
- WorkServerNumber: 1,
- //EngineID: int64(engineID),
- //LogUrl: logObsPath,
- //PoolID: poolID,
- Uuid: uuid,
- //Parameters: param,
+ JobName: jobName,
+ DisplayJobName: displayJobName,
+ ComputeResource: models.NPUResource,
+ Command: command,
+ ResourceSpecId: form.FlavorID,
+ ImageUrl: "",
+ ImageId: form.ImageID,
+ DataUrl: dataPath,
+ Description: description,
+ CodeObsPath: codeObsPath,
+ BootFileUrl: codeObsPath + bootFile,
+ BootFile: bootFile,
+ WorkServerNumber: form.WorkServerNumber,
+ Uuid: uuid,
CommitID: commitID,
IsLatestVersion: isLatestVersion,
BranchName: branchName,
Params: form.Params,
- FlavorName: FlavorName,
- EngineName: EngineName,
- VersionCount: VersionCount,
+ FlavorName: flavorName,
+ EngineName: engineName,
+ VersionCount: versionCount,
TotalVersionCount: modelarts.TotalVersionCount,
}
- //将params转换Parameters.Parameter,出错时返回给前端
- var Parameters modelarts.Parameters
- if err := json.Unmarshal([]byte(params), &Parameters); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return
- }
-
err = grampus.GenerateTrainJob(ctx, req)
if err != nil {
log.Error("GenerateTrainJob failed:%v", err.Error())
@@ -517,25 +479,6 @@ func GrampusTrainJobShow(ctx *context.Context) {
task.DatasetName = attachment.Name
}
- if len(task.Parameters) > 0 {
- var parameters models.Parameters
- err := json.Unmarshal([]byte(task.Parameters), ¶meters)
- if err != nil {
- log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
- ctx.ServerError("system error", err)
- return
- }
-
- if len(parameters.Parameter) > 0 {
- paramTemp := ""
- for _, Parameter := range parameters.Parameter {
- param := Parameter.Label + " = " + Parameter.Value + "; "
- paramTemp = paramTemp + param
- }
- task.Parameters = paramTemp[:len(paramTemp)-2]
- }
- }
-
if task.DeletedAt.IsZero() { //normal record
result, err := grampus.GetJob(task.JobID)
if err != nil {
@@ -565,6 +508,25 @@ func GrampusTrainJobShow(ctx *context.Context) {
}
}
+ if len(task.Parameters) > 0 {
+ var parameters models.Parameters
+ err := json.Unmarshal([]byte(task.Parameters), ¶meters)
+ if err != nil {
+ log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
+ ctx.ServerError("system error", err)
+ return
+ }
+
+ if len(parameters.Parameter) > 0 {
+ paramTemp := ""
+ for _, Parameter := range parameters.Parameter {
+ param := Parameter.Label + " = " + Parameter.Value + "; "
+ paramTemp = paramTemp + param
+ }
+ task.Parameters = paramTemp[:len(paramTemp)-2]
+ }
+ }
+
taskList := make([]*models.Cloudbrain, 0)
taskList = append(taskList, task)
ctx.Data["version_list_task"] = taskList
@@ -595,3 +557,13 @@ func GrampusGetLog(ctx *context.Context) {
return
}
+
+func generateCommand(processorType, codePath, dataPath, params, outputPath string) (string, error) {
+ var command string
+ //download code
+ //download dataset
+ //unzip dataset
+ //exec code
+ //upload models
+ return command, nil
+}
diff --git a/templates/repo/grampus/trainjob/npu/new.tmpl b/templates/repo/grampus/trainjob/npu/new.tmpl
index 6f5f5455f0..9e5ba39bcc 100755
--- a/templates/repo/grampus/trainjob/npu/new.tmpl
+++ b/templates/repo/grampus/trainjob/npu/new.tmpl
@@ -136,18 +136,10 @@
{{.i18n.Tr "repo.modelarts.train_job.AI_driver"}}
-
-
- {{range .engines}}
- {{.Value}}
- {{end}}
-
-
-
-
+
{{range .engine_versions}}
- {{.Value}}
+ {{.Name}}
{{end}}
@@ -175,22 +167,6 @@
{{.i18n.Tr "repo.modelarts.train_job.add_run_parameter"}}
- {{if ne 0 (len .params)}}
- {{range $k ,$v := .params}}
-
- {{end}}
- {{end}}
@@ -224,7 +200,7 @@
{{.i18n.Tr "repo.modelarts.train_job.standard"}}
{{range .flavor_infos}}
- {{.Value}}
+ {{.Name}}
{{end}}
@@ -237,7 +213,6 @@
1
- 2
--
2.34.1
From 685a14ba1930c5643a352326b4c47d40ae4fe2a5 Mon Sep 17 00:00:00 2001
From: lewis <747342561@qq.com>
Date: Thu, 26 May 2022 20:54:02 +0800
Subject: [PATCH 08/56] generate command
---
modules/grampus/grampus.go | 15 +++++------
modules/util/path.go | 10 ++++++++
routers/repo/grampus.go | 51 +++++++++++++++++++++++++++++++-------
3 files changed, 58 insertions(+), 18 deletions(-)
mode change 100644 => 100755 modules/util/path.go
diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go
index 13280cac33..bde663ebf6 100755
--- a/modules/grampus/grampus.go
+++ b/modules/grampus/grampus.go
@@ -10,17 +10,11 @@ import (
)
const (
- //notebook
- storageTypeOBS = "obs"
- autoStopDuration = 4 * 60 * 60
- autoStopDurationMs = 4 * 60 * 60 * 1000
-
- DataSetMountPath = "/home/ma-user/work"
- NotebookEnv = "Python3"
- NotebookType = "Ascend"
- FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"
+ storageTypeOBS = "obs"
+ WorkPath = "/home/ma-user/work"
CodePath = "/code/"
+ DatasetPath = "/dataset"
OutputPath = "/output/"
ResultPath = "/result/"
LogPath = "/log/"
@@ -45,6 +39,9 @@ const (
ProcessorTypeNPU = "npu.huawei.com/NPU"
ProcessorTypeGPU = "nvidia.com/gpu"
+
+ CommandPrepareScript = "pwd;cd /tmp;wget https://git.openi.org.cn/lewis/script_for_grampus/archive/master.zip;unzip master.zip;cd script_for_grampus;"
+ ScriptSyncObsCodeAndDataset = "sync_obs_code_and_dataset.py"
)
var (
diff --git a/modules/util/path.go b/modules/util/path.go
old mode 100644
new mode 100755
index 2b198eb6dc..1db6e43793
--- a/modules/util/path.go
+++ b/modules/util/path.go
@@ -31,3 +31,13 @@ func GetDirectorySize(path string) (int64, error) {
})
return size, err
}
+
+// IsDir reports whether path exists and is a directory; any os.Stat error yields false.
+func IsDir(path string) bool {
+ s, err := os.Stat(path)
+ if err != nil {
+ return false
+ }
+
+ return s.IsDir()
+}
diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go
index 3f4b1361cd..c16363da49 100755
--- a/routers/repo/grampus.go
+++ b/routers/repo/grampus.go
@@ -9,6 +9,7 @@ import (
"code.gitea.io/gitea/modules/util"
"encoding/json"
"errors"
+ "fmt"
"io/ioutil"
"net/http"
"os"
@@ -273,6 +274,15 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
}
}
+ //check dataset
+ attachment, err := models.GetAttachmentByUUID(uuid)
+ if err != nil {
+ log.Error("GetAttachmentByUUID failed:", err.Error(), ctx.Data["MsgID"])
+ grampusTrainJobNpuNewDataPrepare(ctx)
+ ctx.RenderWithErr("dataset is not exist", tplGrampusTrainJobNPUNew, &form)
+ return
+ }
+
//prepare code and out path
_, err = ioutil.ReadDir(codeLocalPath)
if err == nil {
@@ -310,7 +320,8 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
//prepare command
//todo: download code, download dataset, unzip dataset, exec code, upload model
- command, err := generateCommand(grampus.ProcessorTypeNPU, codeObsPath, dataPath, params, "")
+ command, err := generateCommand(grampus.ProcessorTypeNPU, "obs:/"+codeObsPath, "obs:/"+dataPath, params, "", attachment.Name)
+ log.Info(command)
var parameters models.Parameters
param := make([]models.Parameter, 0)
if len(params) != 0 {
@@ -360,6 +371,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
EngineName: engineName,
VersionCount: versionCount,
TotalVersionCount: modelarts.TotalVersionCount,
+ DatasetName: attachment.Name,
}
err = grampus.GenerateTrainJob(ctx, req)
@@ -474,11 +486,6 @@ func GrampusTrainJobShow(ctx *context.Context) {
return
}
- attachment, err := models.GetAttachmentByUUID(task.Uuid)
- if err == nil {
- task.DatasetName = attachment.Name
- }
-
if task.DeletedAt.IsZero() { //normal record
result, err := grampus.GetJob(task.JobID)
if err != nil {
@@ -524,6 +531,8 @@ func GrampusTrainJobShow(ctx *context.Context) {
paramTemp = paramTemp + param
}
task.Parameters = paramTemp[:len(paramTemp)-2]
+ } else {
+ task.Parameters = ""
}
}
@@ -558,12 +567,36 @@ func GrampusGetLog(ctx *context.Context) {
return
}
-func generateCommand(processorType, codePath, dataPath, params, outputPath string) (string, error) {
+func generateCommand(processorType, codeObsPath, dataObsPath, params, outputPath, datasetName string) (string, error) {
var command string
- //download code
- //download dataset
+
+ command += grampus.CommandPrepareScript
+ //download code & dataset
+ if processorType == grampus.ProcessorTypeNPU {
+ commandDownload := "python " + grampus.ScriptSyncObsCodeAndDataset + " --access_key=" + setting.AccessKeyID + " --secret_key=" + setting.SecretAccessKey + " --project_id=" + setting.ProjectID + " --region_name=" + setting.Location + " --code_obs_dir=" + codeObsPath + " --data_obs_dir=" + dataObsPath + " --dataset_name=" + datasetName + ";"
+ command += commandDownload
+ } else if processorType == grampus.ProcessorTypeGPU {
+
+ }
+
//unzip dataset
//exec code
//upload models
return command, nil
}
+
+func generateCommandObsDownloadFile(srcObsFile, dstLocalDir string) (string, error) {
+ var command string
+
+ command = "python;"
+ command += "from modelarts.session import Session \n"
+ command += fmt.Sprintf("session = Session(access_key='%s',secret_key='%s', project_id='%s', region_name='%s') \n", setting.AccessKeyID, setting.SecretAccessKey, setting.ProjectID, setting.Location)
+
+ if util.IsDir(srcObsFile) {
+ command += fmt.Sprintf("session.obs.download_dir(src_obs_dir=\"%s\", dst_local_dir=\"%s\") \n", srcObsFile, dstLocalDir)
+ } else {
+ command += fmt.Sprintf("session.obs.download_file(src_obs_file=\"%s\", dst_local_dir=\"%s\") \n", srcObsFile, dstLocalDir)
+ }
+
+ return command, nil
+}
--
2.34.1
From d2ed49fbacee9bd920adf5f06e42a1341a4e6739 Mon Sep 17 00:00:00 2001
From: lewis <747342561@qq.com>
Date: Mon, 30 May 2022 16:01:43 +0800
Subject: [PATCH 09/56] gen command
---
modules/grampus/grampus.go | 6 +-
routers/repo/grampus.go | 115 ++++++++++++++++++++++++++-----------
2 files changed, 85 insertions(+), 36 deletions(-)
diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go
index bde663ebf6..45ff979a5a 100755
--- a/modules/grampus/grampus.go
+++ b/modules/grampus/grampus.go
@@ -18,7 +18,7 @@ const (
OutputPath = "/output/"
ResultPath = "/result/"
LogPath = "/log/"
- JobPath = "/job/"
+ JobPath = "job/"
OrderDesc = "desc" //向下查询
OrderAsc = "asc" //向上查询
Lines = 500
@@ -40,8 +40,8 @@ const (
ProcessorTypeNPU = "npu.huawei.com/NPU"
ProcessorTypeGPU = "nvidia.com/gpu"
- CommandPrepareScript = "pwd;cd /tmp;wget https://git.openi.org.cn/lewis/script_for_grampus/archive/master.zip;unzip master.zip;cd script_for_grampus;"
- ScriptSyncObsCodeAndDataset = "sync_obs_code_and_dataset.py"
+ CommandPrepareScript = "pwd;cd /tmp;wget https://git.openi.org.cn/lewis/script_for_grampus/archive/master.zip;unzip master.zip;cd script_for_grampus;chmod 777 sync_for_arm;"
+ CodeArchiveName = "master.zip"
)
var (
diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go
index c16363da49..d3c8642ba5 100755
--- a/routers/repo/grampus.go
+++ b/routers/repo/grampus.go
@@ -10,6 +10,7 @@ import (
"encoding/json"
"errors"
"fmt"
+ "github.com/unknwon/com"
"io/ioutil"
"net/http"
"os"
@@ -224,8 +225,8 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
params := form.Params
repo := ctx.Repo.Repository
codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
- codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
- dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
+ codeObsPath := grampus.JobPath + jobName + modelarts.CodePath
+ dataObsPath := setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
branchName := form.BranchName
isLatestVersion := modelarts.IsLatestVersion
flavorName := form.FlavorName
@@ -289,8 +290,8 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
os.RemoveAll(codeLocalPath)
}
- if err := downloadCode(repo, codeLocalPath, branchName); err != nil {
- log.Error("downloadCode failed, server timed out: %s (%v)", repo.FullName(), err)
+ if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
+ log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
grampusTrainJobNpuNewDataPrepare(ctx)
ctx.RenderWithErr("Create task failed, server timed out", tplGrampusTrainJobNPUNew, &form)
return
@@ -320,33 +321,10 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
//prepare command
//todo: download code, download dataset, unzip dataset, exec code, upload model
- command, err := generateCommand(grampus.ProcessorTypeNPU, "obs:/"+codeObsPath, "obs:/"+dataPath, params, "", attachment.Name)
+ command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", dataObsPath+attachment.Name, bootFile, params, "", attachment.Name)
log.Info(command)
- var parameters models.Parameters
- param := make([]models.Parameter, 0)
- if len(params) != 0 {
- err := json.Unmarshal([]byte(params), ¶meters)
- if err != nil {
- log.Error("Failed to Unmarshal params: %s (%v)", params, err)
- grampusTrainJobNpuNewDataPrepare(ctx)
- ctx.RenderWithErr("运行参数错误", tplGrampusTrainJobNPUNew, &form)
- return
- }
- for _, parameter := range parameters.Parameter {
- param = append(param, models.Parameter{
- Label: parameter.Label,
- Value: parameter.Value,
- })
- }
- }
- param = append(param, models.Parameter{
- Label: modelarts.DeviceTarget,
- Value: modelarts.Ascend,
- })
-
- gitRepo, _ := git.OpenRepository(repo.RepoPath())
- commitID, _ := gitRepo.GetBranchCommitID(branchName)
+ commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
req := &grampus.GenerateTrainJobReq{
JobName: jobName,
@@ -356,7 +334,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
ResourceSpecId: form.FlavorID,
ImageUrl: "",
ImageId: form.ImageID,
- DataUrl: dataPath,
+ DataUrl: dataObsPath,
Description: description,
CodeObsPath: codeObsPath,
BootFileUrl: codeObsPath + bootFile,
@@ -567,21 +545,47 @@ func GrampusGetLog(ctx *context.Context) {
return
}
-func generateCommand(processorType, codeObsPath, dataObsPath, params, outputPath, datasetName string) (string, error) {
+func generateCommand(repoName, processorType, codeObsPath, dataObsPath, bootFile, paramSrc, outputPath, datasetName string) (string, error) {
var command string
command += grampus.CommandPrepareScript
//download code & dataset
if processorType == grampus.ProcessorTypeNPU {
- commandDownload := "python " + grampus.ScriptSyncObsCodeAndDataset + " --access_key=" + setting.AccessKeyID + " --secret_key=" + setting.SecretAccessKey + " --project_id=" + setting.ProjectID + " --region_name=" + setting.Location + " --code_obs_dir=" + codeObsPath + " --data_obs_dir=" + dataObsPath + " --dataset_name=" + datasetName + ";"
+ commandDownload := "./sync_for_arm " + setting.Bucket + " " + codeObsPath + " " + grampus.CodeArchiveName + " " + dataObsPath + " " + datasetName + ";"
command += commandDownload
} else if processorType == grampus.ProcessorTypeGPU {
}
- //unzip dataset
+ //unzip code & dataset
+ commandUnzip := "cd dataset;unzip " + datasetName + ";cd ../code;unzip master.zip;"
+ command += commandUnzip
+
//exec code
+ var parameters models.Parameters
+ var paramCode string
+ param := make([]models.Parameter, 0)
+ if len(paramSrc) != 0 {
+ err := json.Unmarshal([]byte(paramSrc), ¶meters)
+ if err != nil {
+ log.Error("Failed to Unmarshal params: %s (%v)", paramSrc, err)
+ return command, err
+ }
+
+ for _, parameter := range parameters.Parameter {
+ param = append(param, models.Parameter{
+ Label: parameter.Label,
+ Value: parameter.Value,
+ })
+ paramCode += " --" + parameter.Label + "=" + parameter.Value
+ }
+ }
+
+ commandCode := "cd " + repoName + ";python " + bootFile + paramCode
+ command += commandCode
+
//upload models
+
return command, nil
}
@@ -600,3 +604,48 @@ func generateCommandObsDownloadFile(srcObsFile, dstLocalDir string) (string, err
return command, nil
}
+
+func downloadZipCode(ctx *context.Context, codePath, branchName string) error {
+ archiveType := git.ZIP
+ archivePath := codePath
+
+ if !com.IsDir(archivePath) {
+ if err := os.MkdirAll(archivePath, os.ModePerm); err != nil {
+ log.Error("MkdirAll failed:" + err.Error())
+ return err
+ }
+ }
+
+ // Get corresponding commit.
+ var (
+ commit *git.Commit
+ err error
+ )
+
+ gitRepo := ctx.Repo.GitRepo
+ if err != nil {
+ log.Error("OpenRepository failed:" + err.Error())
+ return err
+ }
+
+ if gitRepo.IsBranchExist(branchName) {
+ commit, err = gitRepo.GetBranchCommit(branchName)
+ if err != nil {
+ log.Error("GetBranchCommit failed:" + err.Error())
+ return err
+ }
+ }
+
+ archivePath = path.Join(archivePath, grampus.CodeArchiveName)
+ if !com.IsFile(archivePath) {
+ if err := commit.CreateArchive(archivePath, git.CreateArchiveOpts{
+ Format: archiveType,
+ Prefix: setting.Repository.PrefixArchiveFiles,
+ }); err != nil {
+ log.Error("CreateArchive failed:" + err.Error())
+ return err
+ }
+ }
+
+ return nil
+}
--
2.34.1
From 8eeaf779354419cef77cac29f6435839154a4d77 Mon Sep 17 00:00:00 2001
From: lewis <747342561@qq.com>
Date: Mon, 30 May 2022 20:14:42 +0800
Subject: [PATCH 10/56] debug
---
modules/grampus/grampus.go | 2 +-
routers/repo/grampus.go | 20 +++++++-------------
2 files changed, 8 insertions(+), 14 deletions(-)
diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go
index 45ff979a5a..4c46296e5d 100755
--- a/modules/grampus/grampus.go
+++ b/modules/grampus/grampus.go
@@ -40,7 +40,7 @@ const (
ProcessorTypeNPU = "npu.huawei.com/NPU"
ProcessorTypeGPU = "nvidia.com/gpu"
- CommandPrepareScript = "pwd;cd /tmp;wget https://git.openi.org.cn/lewis/script_for_grampus/archive/master.zip;unzip master.zip;cd script_for_grampus;chmod 777 sync_for_arm;"
+ CommandPrepareScript = "pwd;cd /tmp;mkdir output;mkdir code;mkdir dataset;wget https://git.openi.org.cn/lewis/script_for_grampus/archive/master.zip;unzip master.zip;cd script_for_grampus;chmod 777 sync_for_arm uploader_for_grampus;"
CodeArchiveName = "master.zip"
)
diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go
index d3c8642ba5..552fc9b705 100755
--- a/routers/repo/grampus.go
+++ b/routers/repo/grampus.go
@@ -216,7 +216,6 @@ func grampusParamCheckCreateTrainJob(form auth.CreateGrampusTrainJobForm) error
}
func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
- VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
displayJobName := form.DisplayJobName
jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
uuid := form.Attachment
@@ -298,20 +297,13 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
}
//todo: upload code (send to file_server todo this work?)
- if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
+ if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
grampusTrainJobNpuNewDataPrepare(ctx)
ctx.RenderWithErr("Failed to obsMkdir_output", tplGrampusTrainJobNPUNew, &form)
return
}
- if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
- log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
- grampusTrainJobNpuNewDataPrepare(ctx)
- ctx.RenderWithErr("Failed to obsMkdir_log", tplGrampusTrainJobNPUNew, &form)
- return
- }
-
if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
grampusTrainJobNpuNewDataPrepare(ctx)
@@ -321,7 +313,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
//prepare command
//todo: download code, download dataset, unzip dataset, exec code, upload model
- command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", dataObsPath+attachment.Name, bootFile, params, "", attachment.Name)
+ command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", dataObsPath+attachment.Name, bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, attachment.Name)
log.Info(command)
commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
@@ -545,7 +537,7 @@ func GrampusGetLog(ctx *context.Context) {
return
}
-func generateCommand(repoName, processorType, codeObsPath, dataObsPath, bootFile, paramSrc, outputPath, datasetName string) (string, error) {
+func generateCommand(repoName, processorType, codeObsPath, dataObsPath, bootFile, paramSrc, outputObsPath, datasetName string) (string, error) {
var command string
command += grampus.CommandPrepareScript
@@ -558,7 +550,7 @@ func generateCommand(repoName, processorType, codeObsPath, dataObsPath, bootFile
}
//unzip code & dataset
- commandUnzip := "cd dataset;unzip " + datasetName + ";cd ../code;unzip master.zip;"
+ commandUnzip := "cd /tmp/dataset;unzip " + datasetName + ";cd /tmp/code;unzip master.zip;"
command += commandUnzip
//exec code
@@ -581,10 +573,12 @@ func generateCommand(repoName, processorType, codeObsPath, dataObsPath, bootFile
}
}
- commandCode := "cd " + repoName + ";python " + bootFile + paramCode
+ commandCode := "cd " + repoName + ";python " + bootFile + paramCode + ";"
command += commandCode
//upload models
+ commandUpload := "cd /tmp/script_for_grampus/;./uploader_for_grampus " + setting.Bucket + " " + outputObsPath + " " + "/tmp/output/;"
+ command += commandUpload
return command, nil
}
--
2.34.1
From e46202f0acbc6d74981f461eb8d2ad5489579985 Mon Sep 17 00:00:00 2001
From: lewis <747342561@qq.com>
Date: Mon, 30 May 2022 20:54:08 +0800
Subject: [PATCH 11/56] debug
---
modules/grampus/grampus.go | 2 +-
routers/repo/grampus.go | 11 +++++++----
2 files changed, 8 insertions(+), 5 deletions(-)
diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go
index 4c46296e5d..95985e533b 100755
--- a/modules/grampus/grampus.go
+++ b/modules/grampus/grampus.go
@@ -40,7 +40,7 @@ const (
ProcessorTypeNPU = "npu.huawei.com/NPU"
ProcessorTypeGPU = "nvidia.com/gpu"
- CommandPrepareScript = "pwd;cd /tmp;mkdir output;mkdir code;mkdir dataset;wget https://git.openi.org.cn/lewis/script_for_grampus/archive/master.zip;unzip master.zip;cd script_for_grampus;chmod 777 sync_for_arm uploader_for_grampus;"
+ CommandPrepareScript = "pwd;cd /tmp;mkdir output;mkdir code;mkdir dataset;wget -q https://git.openi.org.cn/lewis/script_for_grampus/archive/master.zip;unzip -q master.zip;cd script_for_grampus;chmod 777 sync_for_arm uploader_for_grampus;"
CodeArchiveName = "master.zip"
)
diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go
index 552fc9b705..542b1c3865 100755
--- a/routers/repo/grampus.go
+++ b/routers/repo/grampus.go
@@ -312,9 +312,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
}
//prepare command
- //todo: download code, download dataset, unzip dataset, exec code, upload model
command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", dataObsPath+attachment.Name, bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, attachment.Name)
- log.Info(command)
commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
@@ -550,7 +548,12 @@ func generateCommand(repoName, processorType, codeObsPath, dataObsPath, bootFile
}
//unzip code & dataset
- commandUnzip := "cd /tmp/dataset;unzip " + datasetName + ";cd /tmp/code;unzip master.zip;"
+ toolUnzip := "unzip -q "
+ if strings.HasSuffix(datasetName, ".tar.gz") {
+ toolUnzip = "tar -zxvf "
+ }
+ commandUnzip := "cd /tmp/dataset;" + toolUnzip + datasetName + ";cd /tmp/code;unzip -q master.zip;"
+ commandUnzip += "cd /tmp/dataset/" + strings.TrimSuffix(datasetName, ".zip") + ";ls;"
command += commandUnzip
//exec code
@@ -573,7 +576,7 @@ func generateCommand(repoName, processorType, codeObsPath, dataObsPath, bootFile
}
}
- commandCode := "cd " + repoName + ";python " + bootFile + paramCode + ";"
+ commandCode := "cd /tmp/code/" + repoName + ";python " + bootFile + paramCode + ";"
command += commandCode
//upload models
--
2.34.1
From 99b2c851255c8bad881300517a8feeb59cf072f1 Mon Sep 17 00:00:00 2001
From: lewis <747342561@qq.com>
Date: Tue, 31 May 2022 18:02:30 +0800
Subject: [PATCH 12/56] view
---
models/cloudbrain.go | 14 +++++---
modules/grampus/grampus.go | 3 +-
modules/grampus/resty.go | 2 +-
options/locale/locale_en-US.ini | 5 +++
options/locale/locale_zh-CN.ini | 5 +++
routers/api/v1/api.go | 2 +-
routers/repo/cloudbrain.go | 3 ++
routers/repo/grampus.go | 38 ++++++++++----------
routers/routes/routes.go | 2 +-
templates/repo/cloudbrain/trainjob/new.tmpl | 13 +++++++
templates/repo/grampus/trainjob/gpu/new.tmpl | 13 +++++++
templates/repo/grampus/trainjob/npu/new.tmpl | 13 +++++++
templates/repo/grampus/trainjob/show.tmpl | 13 ++++++-
templates/repo/modelarts/trainjob/new.tmpl | 13 +++++++
14 files changed, 110 insertions(+), 29 deletions(-)
diff --git a/models/cloudbrain.go b/models/cloudbrain.go
index 06fbea5b3f..97fa69e0d7 100755
--- a/models/cloudbrain.go
+++ b/models/cloudbrain.go
@@ -147,6 +147,7 @@ type Cloudbrain struct {
ComputeResource string //计算资源,例如npu
EngineID int64 //引擎id
ImageID string //grampus image_id
+ AiCenter string //grampus ai center: center_id+center_name
TrainUrl string //输出模型的obs路径
BranchName string //分支名称
@@ -1224,11 +1225,14 @@ type GrampusStopJobResponse struct {
}
type GrampusTasks struct {
- Command string `json:"command"`
- Name string `json:"name"`
- ImageId string `json:"imageId"`
- ResourceSpecId string `json:"resourceSpecId"`
- ImageUrl string `json:"imageUrl"`
+ Command string `json:"command"`
+ Name string `json:"name"`
+ ImageId string `json:"imageId"`
+ ResourceSpecId string `json:"resourceSpecId"`
+ ImageUrl string `json:"imageUrl"`
+ CenterID []string `json:"centerID"`
+ CenterName []string `json:"centerName"`
+ ReplicaNum int `json:"replicaNum"`
}
type CreateGrampusJobRequest struct {
diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go
index 95985e533b..a0f398115f 100755
--- a/modules/grampus/grampus.go
+++ b/modules/grampus/grampus.go
@@ -40,7 +40,7 @@ const (
ProcessorTypeNPU = "npu.huawei.com/NPU"
ProcessorTypeGPU = "nvidia.com/gpu"
- CommandPrepareScript = "pwd;cd /tmp;mkdir output;mkdir code;mkdir dataset;wget -q https://git.openi.org.cn/lewis/script_for_grampus/archive/master.zip;unzip -q master.zip;cd script_for_grampus;chmod 777 sync_for_arm uploader_for_grampus;"
+ CommandPrepareScript = "cd /tmp;mkdir -p output;mkdir -p code;mkdir -p dataset;wget -q https://git.openi.org.cn/lewis/script_for_grampus/archive/master.zip;unzip -q master.zip;cd script_for_grampus;chmod 777 sync_for_arm uploader_for_grampus;"
CodeArchiveName = "master.zip"
)
@@ -92,6 +92,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
ResourceSpecId: req.ResourceSpecId,
ImageId: req.ImageId,
ImageUrl: req.ImageUrl,
+ ReplicaNum: 0,
},
},
})
diff --git a/modules/grampus/resty.go b/modules/grampus/resty.go
index 183afb853b..bd64ace8f4 100755
--- a/modules/grampus/resty.go
+++ b/modules/grampus/resty.go
@@ -228,7 +228,7 @@ func GetTrainJobLog(jobID string) (string, error) {
res, err := client.R().
SetAuthToken(TOKEN).
SetResult(&logContent).
- Get(HOST + urlTrainJob + "/" + jobID + "/log")
+ Get(HOST + urlTrainJob + "/" + jobID + "/task/0/replica/0/log")
if err != nil {
return logContent, fmt.Errorf("resty GetTrainJobLog: %v", err)
diff --git a/options/locale/locale_en-US.ini b/options/locale/locale_en-US.ini
index c52a369ce4..3166dafb7d 100755
--- a/options/locale/locale_en-US.ini
+++ b/options/locale/locale_en-US.ini
@@ -1170,6 +1170,8 @@ model.manage.sava_model = Sava Model
model.manage.model_manage = ModelManage
model.manage.model_accuracy = Model Accuracy
+grampus.train_job.ai_center = AI Center
+
template.items = Template Items
template.git_content = Git Content (Default Branch)
template.git_hooks = Git Hooks
@@ -3013,6 +3015,9 @@ Platform_Tutorial = Tutorial
foot.advice_feedback = Feedback
[cloudbrain]
+resource_cluster = Resource Cluster
+resource_cluster_openi = OpenI Resource Cluster
+resource_cluster_c2net = China Computing NET
compute_resource = Computing resources
task_name = Task name
task_type = Task type
diff --git a/options/locale/locale_zh-CN.ini b/options/locale/locale_zh-CN.ini
index cb1c7565a7..e9b6a52803 100755
--- a/options/locale/locale_zh-CN.ini
+++ b/options/locale/locale_zh-CN.ini
@@ -1180,6 +1180,8 @@ model.manage.sava_model = 保存模型
model.manage.model_manage = 模型管理
model.manage.model_accuracy = 模型精度
+grampus.train_job.ai_center=ai计算中心
+
template.items=模板选项
template.git_content=Git数据(默认分支)
template.git_hooks=Git 钩子
@@ -3023,6 +3025,9 @@ Platform_Tutorial=新手指引
foot.advice_feedback = 意见反馈
[cloudbrain]
+resource_cluster = 算力集群
+resource_cluster_openi = 启智集群
+resource_cluster_c2net = 智算集群
compute_resource = 计算资源
task_name = 任务名称
task_type = 任务类型
diff --git a/routers/api/v1/api.go b/routers/api/v1/api.go
index 471f8be7ef..f6153e811c 100755
--- a/routers/api/v1/api.go
+++ b/routers/api/v1/api.go
@@ -935,9 +935,9 @@ func RegisterRoutes(m *macaron.Macaron) {
})
}, reqRepoReader(models.UnitTypeCloudBrain))
m.Group("/grampus", func() {
- m.Get("/:id", repo.GetCloudbrainTask)
m.Group("/train-job", func() {
m.Group("/:jobid", func() {
+ m.Get("", repo.GetModelArtsTrainJobVersion)
m.Post("/stop_version", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo_ext.GrampusStopJob)
m.Get("/log", repo_ext.GrampusGetLog)
})
diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go
index 69cd249011..525dd07bb3 100755
--- a/routers/repo/cloudbrain.go
+++ b/routers/repo/cloudbrain.go
@@ -1477,6 +1477,9 @@ func SyncCloudbrainStatus() {
}
if result != nil {
+ if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
+ task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
+ }
task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
task.Duration = result.JobInfo.RunSec
task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go
index 542b1c3865..d25a75c688 100755
--- a/routers/repo/grampus.go
+++ b/routers/repo/grampus.go
@@ -9,7 +9,6 @@ import (
"code.gitea.io/gitea/modules/util"
"encoding/json"
"errors"
- "fmt"
"github.com/unknwon/com"
"io/ioutil"
"net/http"
@@ -458,11 +457,14 @@ func GrampusTrainJobShow(ctx *context.Context) {
result, err := grampus.GetJob(task.JobID)
if err != nil {
log.Error("GetJob failed:" + err.Error())
- ctx.ServerError("GetJob failed", err)
- return
+ //ctx.ServerError("GetJob failed", err)
+ //return
}
if result != nil {
+ if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
+ task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
+ }
task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
if task.Status != result.JobInfo.Status || result.JobInfo.Status == models.GrampusStatusRunning {
task.Duration = result.JobInfo.RunSec
@@ -508,6 +510,13 @@ func GrampusTrainJobShow(ctx *context.Context) {
taskList = append(taskList, task)
ctx.Data["version_list_task"] = taskList
+ ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task)
+
+ aiCenterInfo := strings.Split(task.AiCenter, "+")
+ if len(aiCenterInfo) == 2 {
+ ctx.Data["ai_center"] = aiCenterInfo[1]
+ }
+
ctx.HTML(http.StatusOK, tplGrampusTrainJobShow)
}
@@ -553,7 +562,6 @@ func generateCommand(repoName, processorType, codeObsPath, dataObsPath, bootFile
toolUnzip = "tar -zxvf "
}
commandUnzip := "cd /tmp/dataset;" + toolUnzip + datasetName + ";cd /tmp/code;unzip -q master.zip;"
- commandUnzip += "cd /tmp/dataset/" + strings.TrimSuffix(datasetName, ".zip") + ";ls;"
command += commandUnzip
//exec code
@@ -579,25 +587,17 @@ func generateCommand(repoName, processorType, codeObsPath, dataObsPath, bootFile
commandCode := "cd /tmp/code/" + repoName + ";python " + bootFile + paramCode + ";"
command += commandCode
+ //get exec result
+ commandGetRes := "result=$?;"
+ command += commandGetRes
+
//upload models
commandUpload := "cd /tmp/script_for_grampus/;./uploader_for_grampus " + setting.Bucket + " " + outputObsPath + " " + "/tmp/output/;"
command += commandUpload
- return command, nil
-}
-
-func generateCommandObsDownloadFile(srcObsFile, dstLocalDir string) (string, error) {
- var command string
-
- command = "python;"
- command += "from modelarts.session import Session \n"
- command += fmt.Sprintf("session = Session(access_key='%s',secret_key='%s', project_id='%s', region_name='%s') \n", setting.AccessKeyID, setting.SecretAccessKey, setting.ProjectID, setting.Location)
-
- if util.IsDir(srcObsFile) {
- command += fmt.Sprintf("session.obs.download_dir(src_obs_dir=\"%s\", dst_local_dir=\"%s\") \n", srcObsFile, dstLocalDir)
- } else {
- command += fmt.Sprintf("session.obs.download_file(src_obs_file=\"%s\", dst_local_dir=\"%s\") \n", srcObsFile, dstLocalDir)
- }
+ //check exec result
+ commandCheckRes := " [[ result -eq 0 ]] && echo success || ls failed;"
+ command += commandCheckRes
return command, nil
}
diff --git a/routers/routes/routes.go b/routers/routes/routes.go
index 64566a1d7c..ab9f12205a 100755
--- a/routers/routes/routes.go
+++ b/routers/routes/routes.go
@@ -1089,7 +1089,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Get("", reqRepoCloudBrainReader, repo.GrampusTrainJobShow)
m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.GrampusStopJob)
m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.GrampusTrainJobDel)
- m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel)
+ m.Get("/model_download", cloudbrain.AdminOrJobCreaterRightForTrain, repo.ModelDownload)
})
m.Group("/gpu", func() {
m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.GrampusTrainJobGPUNew)
diff --git a/templates/repo/cloudbrain/trainjob/new.tmpl b/templates/repo/cloudbrain/trainjob/new.tmpl
index 39315cfad5..4eff7c21c5 100755
--- a/templates/repo/cloudbrain/trainjob/new.tmpl
+++ b/templates/repo/cloudbrain/trainjob/new.tmpl
@@ -82,6 +82,19 @@
+
+ {{.i18n.Tr "cloudbrain.resource_cluster"}}
+
+
{{.i18n.Tr "cloudbrain.compute_resource"}}