|
- package repo
-
- import (
- "archive/zip"
- "encoding/json"
- "errors"
- "fmt"
- "io"
- "io/ioutil"
- "net/http"
- "os"
- "path"
- "strconv"
- "strings"
- "time"
- "unicode/utf8"
-
- "code.gitea.io/gitea/modules/notification"
- "code.gitea.io/gitea/modules/timeutil"
-
- "code.gitea.io/gitea/models"
- "code.gitea.io/gitea/modules/auth"
- "code.gitea.io/gitea/modules/base"
- "code.gitea.io/gitea/modules/cloudbrain"
- "code.gitea.io/gitea/modules/context"
- "code.gitea.io/gitea/modules/git"
- "code.gitea.io/gitea/modules/log"
- "code.gitea.io/gitea/modules/modelarts"
- "code.gitea.io/gitea/modules/obs"
- "code.gitea.io/gitea/modules/setting"
- "code.gitea.io/gitea/modules/storage"
- "code.gitea.io/gitea/modules/util"
- )
-
- // Template names for the debug-job, notebook, train-job and inference-job pages.
- const (
- // Debug-job list page.
- tplDebugJobIndex base.TplName = "repo/debugjob/index"
-
- // Notebook pages: list, creation form and detail view.
- tplModelArtsNotebookIndex base.TplName = "repo/modelarts/notebook/index"
- tplModelArtsNotebookNew base.TplName = "repo/modelarts/notebook/new"
- tplModelArtsNotebookShow base.TplName = "repo/modelarts/notebook/show"
-
- // Train-job pages: list, creation form, detail view and new-version form.
- tplModelArtsTrainJobIndex base.TplName = "repo/modelarts/trainjob/index"
- tplModelArtsTrainJobNew base.TplName = "repo/modelarts/trainjob/new"
- tplModelArtsTrainJobShow base.TplName = "repo/modelarts/trainjob/show"
- tplModelArtsTrainJobVersionNew base.TplName = "repo/modelarts/trainjob/version_new"
-
- // Inference-job pages: list, creation form and detail view.
- tplModelArtsInferenceJobIndex base.TplName = "repo/modelarts/inferencejob/index"
- tplModelArtsInferenceJobNew base.TplName = "repo/modelarts/inferencejob/new"
- tplModelArtsInferenceJobShow base.TplName = "repo/modelarts/inferencejob/show"
- )
-
- func DebugJobIndex(ctx *context.Context) {
- listType := ctx.Query("debugListType")
- ctx.Data["ListType"] = listType
- MustEnableCloudbrain(ctx)
- repo := ctx.Repo.Repository
- page := ctx.QueryInt("page")
- if page <= 0 {
- page = 1
- }
- typeCloudBrain := models.TypeCloudBrainAll
- jobTypeNot := false
- if listType == models.GPUResource {
- typeCloudBrain = models.TypeCloudBrainOne
- } else if listType == models.NPUResource {
- typeCloudBrain = models.TypeCloudBrainTwo
- } else if listType == models.AllResource {
- typeCloudBrain = models.TypeCloudBrainAll
- } else {
- log.Error("listType(%s) error", listType)
- ctx.ServerError("listType error", errors.New("listType error"))
- return
- }
-
- var jobTypes []string
- jobTypes = append(jobTypes, string(models.JobTypeSnn4imagenet), string(models.JobTypeBrainScore), string(models.JobTypeDebug))
- ciTasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
- ListOptions: models.ListOptions{
- Page: page,
- PageSize: setting.UI.IssuePagingNum,
- },
- RepoID: repo.ID,
- Type: typeCloudBrain,
- JobTypeNot: jobTypeNot,
- JobTypes: jobTypes,
- })
- if err != nil {
- ctx.ServerError("Get debugjob faild:", err)
- return
- }
-
- for i, task := range ciTasks {
- ciTasks[i].CanDebug = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
- ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
- ciTasks[i].Cloudbrain.ComputeResource = task.ComputeResource
- }
-
- pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
- pager.AddParam(ctx, "debugListType", "ListType")
- ctx.Data["Page"] = pager
- ctx.Data["PageIsCloudBrain"] = true
- ctx.Data["Tasks"] = ciTasks
- ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx)
- ctx.Data["RepoIsEmpty"] = repo.IsEmpty
- ctx.Data["debugListType"] = listType
- ctx.HTML(200, tplDebugJobIndex)
- }
-
- // MustEnableModelArts answers with "not found" when the current user cannot
- // read the repository's cloud-brain unit. It returns nothing; the outcome is
- // only visible through the response written on ctx.
- func MustEnableModelArts(ctx *context.Context) {
- 	if canRead := ctx.Repo.CanRead(models.UnitTypeCloudBrain); !canRead {
- 		ctx.NotFound("MustEnableCloudbrain", nil)
- 	}
- }
-
- // NotebookNew renders the notebook creation form.
- func NotebookNew(ctx *context.Context) {
- 	// notebookNewDataPrepare writes the error response itself before
- 	// returning a non-nil error, so the form must not be rendered afterwards.
- 	if err := notebookNewDataPrepare(ctx); err != nil {
- 		return
- 	}
-
- 	ctx.HTML(200, tplModelArtsNotebookNew)
- }
-
- // notebookNewDataPrepare fills ctx.Data with everything the notebook creation
- // form needs: a generated display job name, the user's attachments, the image
- // list, the flavor list and the cloud-brain type.
- //
- // The image and flavor lists are decoded from the settings on first use and
- // kept in modelarts.ImageInfos / modelarts.FlavorInfos. On failure the server
- // error response is written and the error is returned.
- func notebookNewDataPrepare(ctx *context.Context) error {
- 	ctx.Data["PageIsCloudBrain"] = true
- 	t := time.Now()
- 	var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
- 	ctx.Data["display_job_name"] = displayJobName
-
- 	attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID)
- 	if err != nil {
- 		ctx.ServerError("GetAllUserAttachments failed:", err)
- 		return err
- 	}
- 	ctx.Data["attachments"] = attachs
-
- 	if modelarts.ImageInfos == nil {
- 		if err := json.Unmarshal([]byte(setting.ImageInfos), &modelarts.ImageInfos); err != nil {
- 			log.Error("json.Unmarshal ImageInfos failed: %v", err)
- 		}
- 	}
- 	// The pointer stays nil when the setting is empty, invalid or "null";
- 	// dereferencing it below would panic.
- 	if modelarts.ImageInfos == nil {
- 		err := errors.New("image infos are not configured")
- 		ctx.ServerError("load image infos failed:", err)
- 		return err
- 	}
- 	ctx.Data["images"] = modelarts.ImageInfos.ImageInfo
-
- 	if modelarts.FlavorInfos == nil {
- 		if err := json.Unmarshal([]byte(setting.FlavorInfos), &modelarts.FlavorInfos); err != nil {
- 			log.Error("json.Unmarshal FlavorInfos failed: %v", err)
- 		}
- 	}
- 	if modelarts.FlavorInfos == nil {
- 		err := errors.New("flavor infos are not configured")
- 		ctx.ServerError("load flavor infos failed:", err)
- 		return err
- 	}
- 	ctx.Data["flavors"] = modelarts.FlavorInfos.FlavorInfo
-
- 	ctx.Data["cloudbraintype"] = models.TypeCloudBrainTwo
-
- 	return nil
- }
-
- // NotebookCreate handles the submission of the notebook creation form.
- //
- // A user may only have one running or waiting notebook and the job name must
- // not exist yet. On success the browser is redirected to the debug-job list;
- // otherwise the form is rendered again with an error message.
- func NotebookCreate(ctx *context.Context, form auth.CreateModelArtsNotebookForm) {
- 	ctx.Data["PageIsNotebook"] = true
-
- 	// reject re-populates the form data and shows the form with a message.
- 	reject := func(msg string) {
- 		cloudBrainNewDataPrepare(ctx)
- 		ctx.RenderWithErr(msg, tplModelArtsNotebookNew, &form)
- 	}
-
- 	running, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID)
- 	if err != nil {
- 		log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
- 		reject("system error")
- 		return
- 	}
- 	if running >= 1 {
- 		log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
- 		reject("you have already a running or waiting task, can not create more")
- 		return
- 	}
-
- 	// A successful lookup means the name is taken; only "job does not
- 	// exist" lets the creation continue.
- 	if _, err = models.GetCloudbrainByName(form.JobName); err == nil {
- 		log.Error("the job name did already exist", ctx.Data["MsgID"])
- 		reject("the job name did already exist")
- 		return
- 	}
- 	if !models.IsErrJobNotExist(err) {
- 		log.Error("system error, %v", err, ctx.Data["MsgID"])
- 		reject("system error")
- 		return
- 	}
-
- 	if err = modelarts.GenerateTask(ctx, form.JobName, form.Attachment, form.Description, form.Flavor); err != nil {
- 		ctx.RenderWithErr(err.Error(), tplModelArtsNotebookNew, &form)
- 		return
- 	}
- 	ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all")
- }
-
- // Notebook2Create handles the submission of the notebook creation form for
- // the Notebook2 flow.
- //
- // The internal job name is derived from the display job name. A user may only
- // have one running or waiting notebook, and the display name must be unique
- // among the repository's debug tasks. On success the browser is redirected to
- // the debug-job list; otherwise the form is rendered again with an error.
- func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm) {
- 	ctx.Data["PageIsNotebook"] = true
-
- 	displayJobName := form.DisplayJobName
- 	jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
- 	repoID := ctx.Repo.Repository.ID
-
- 	// reject re-populates the form data and shows the form with a message.
- 	reject := func(msg string) {
- 		notebookNewDataPrepare(ctx)
- 		ctx.RenderWithErr(msg, tplModelArtsNotebookNew, &form)
- 	}
-
- 	running, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID)
- 	if err != nil {
- 		log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
- 		reject("system error")
- 		return
- 	}
- 	if running >= 1 {
- 		log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
- 		reject("you have already a running or waiting task, can not create more")
- 		return
- 	}
-
- 	existing, err := models.GetCloudbrainsByDisplayJobName(repoID, string(models.JobTypeDebug), displayJobName)
- 	if err != nil {
- 		// "job does not exist" is the expected outcome for a fresh name.
- 		if !models.IsErrJobNotExist(err) {
- 			log.Error("system error, %v", err, ctx.Data["MsgID"])
- 			reject("system error")
- 			return
- 		}
- 	} else if len(existing) != 0 {
- 		log.Error("the job name did already exist", ctx.Data["MsgID"])
- 		reject("the job name did already exist")
- 		return
- 	}
-
- 	err = modelarts.GenerateNotebook2(ctx, displayJobName, jobName, form.Attachment, form.Description, form.Flavor, form.ImageId)
- 	if err != nil {
- 		log.Error("GenerateNotebook2 failed, %v", err, ctx.Data["MsgID"])
- 		reject(err.Error())
- 		return
- 	}
- 	ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all")
- }
-
- // NotebookShow renders the detail page of a notebook task, including tasks
- // that have been soft-deleted.
- //
- // The task status is refreshed from ModelArts and persisted when it changed
- // (only for records that are not deleted). The dataset download link is only
- // provided to the signed-in owner of the task.
- func NotebookShow(ctx *context.Context) {
- 	ctx.Data["PageIsCloudBrain"] = true
- 	debugListType := ctx.Query("debugListType")
-
- 	var ID = ctx.Params(":id")
- 	task, err := models.GetCloudbrainByIDWithDeleted(ID)
- 	if err != nil {
- 		ctx.Data["error"] = err.Error()
- 		ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
- 		return
- 	}
-
- 	result, err := modelarts.GetNotebook2(task.JobID)
- 	if err != nil {
- 		ctx.Data["error"] = err.Error()
- 		ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
- 		return
- 	}
-
- 	// Deleted records keep the status they were deleted with.
- 	if result != nil && task.DeletedAt.IsZero() && task.Status != result.Status {
- 		task.Status = result.Status
- 		if err = models.UpdateJob(task); err != nil {
- 			ctx.Data["error"] = err.Error()
- 			ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
- 			return
- 		}
- 	}
-
- 	datasetDownloadLink := ""
- 	if ctx.IsSigned && task.Uuid != "" && task.UserID == ctx.User.ID {
- 		// A missing attachment is not fatal; the page is shown without a link.
- 		if attachment, err := models.GetAttachmentByUUID(task.Uuid); err == nil {
- 			task.DatasetName = attachment.Name
- 			datasetDownloadLink = attachment.S3DownloadURL()
- 		}
- 	}
-
- 	if user, err := models.GetUserByID(task.UserID); err == nil {
- 		task.User = user
- 	}
-
- 	if modelarts.FlavorInfos == nil {
- 		if err := json.Unmarshal([]byte(setting.FlavorInfos), &modelarts.FlavorInfos); err != nil {
- 			log.Error("json.Unmarshal FlavorInfos failed: %v", err)
- 		}
- 	}
- 	// Guard against an empty flavor list: indexing element 0 would panic.
- 	if modelarts.FlavorInfos != nil && len(modelarts.FlavorInfos.FlavorInfo) > 0 {
- 		// Fall back to the first flavor when the task's flavor code is unknown.
- 		ctx.Data["resource_spec"] = modelarts.FlavorInfos.FlavorInfo[0].Desc
- 		for _, f := range modelarts.FlavorInfos.FlavorInfo {
- 			if fmt.Sprint(f.Value) == task.FlavorCode {
- 				ctx.Data["resource_spec"] = f.Desc
- 				break
- 			}
- 		}
- 	}
-
- 	if task.TrainJobDuration == "" {
- 		if task.Duration == 0 {
- 			// No stored duration: a running task is measured up to now, any
- 			// other task up to its last update.
- 			var duration int64
- 			if task.Status == string(models.JobRunning) {
- 				duration = time.Now().Unix() - int64(task.CreatedUnix)
- 			} else {
- 				duration = int64(task.UpdatedUnix) - int64(task.CreatedUnix)
- 			}
- 			task.Duration = duration
- 		}
- 		task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
- 	}
-
- 	ctx.Data["duration"] = task.TrainJobDuration
- 	ctx.Data["datasetDownloadLink"] = datasetDownloadLink
- 	ctx.Data["task"] = task
- 	ctx.Data["ID"] = ID
- 	ctx.Data["jobName"] = task.JobName
- 	ctx.Data["debugListType"] = debugListType
- 	ctx.HTML(200, tplModelArtsNotebookShow)
- }
-
- func NotebookDebug(ctx *context.Context) {
- var jobID = ctx.Params(":jobid")
-
- result, err := modelarts.GetJob(jobID)
- if err != nil {
- ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil)
- return
- }
-
- res, err := modelarts.GetJobToken(jobID)
- if err != nil {
- ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil)
- return
- }
-
- urls := strings.Split(result.Spec.Annotations.Url, "/")
- urlPrefix := result.Spec.Annotations.TargetDomain
- for i, url := range urls {
- if i > 2 {
- urlPrefix += "/" + url
- }
- }
-
- debugUrl := urlPrefix + "?token=" + res.Token
- ctx.Redirect(debugUrl)
- }
-
- // NotebookDebug2 redirects the browser to the debug URL of the task stored in
- // ctx.Cloudbrain, with the notebook token appended as the "token" query
- // parameter.
- func NotebookDebug2(ctx *context.Context) {
- 	notebook, err := modelarts.GetNotebook2(ctx.Cloudbrain.JobID)
- 	if err != nil {
- 		ctx.RenderWithErr(err.Error(), tplModelArtsNotebookIndex, nil)
- 		return
- 	}
-
- 	debugURL := notebook.Url + "?token=" + notebook.Token
- 	ctx.Redirect(debugURL)
- }
-
- // NotebookManage stops or restarts the notebook task given by the ":id" route
- // parameter, depending on the ":action" route parameter, and answers with a
- // JSON object holding "result_code" ("0" on success, "-1" on failure),
- // "error_msg", "status" and "id".
- //
- // Stop is allowed for the task owner, site admins and repository owners and
- // requires a running task. Restart is allowed for the task owner and site
- // admins, requires a stopped or failed task and no other running or waiting
- // notebook of the user; it creates a new task record whose ID is returned.
- func NotebookManage(ctx *context.Context) {
- 	var ID = ctx.Params(":id")
- 	var action = ctx.Params(":action")
- 	var resultCode = "0"
- 	var errorMsg = ""
- 	var status = ""
-
- 	// Single-pass loop: "break" jumps to the JSON response below.
- 	for {
- 		task, err := models.GetCloudbrainByID(ID)
- 		if err != nil {
- 			// task must not be touched here: it is not valid when the
- 			// lookup failed, so the requested ID is logged instead.
- 			log.Error("get task(%s) failed:%v", ID, err.Error(), ctx.Data["MsgID"])
- 			resultCode = "-1"
- 			errorMsg = "system error"
- 			break
- 		}
-
- 		if action == models.ActionStop {
- 			if task.Status != string(models.ModelArtsRunning) {
- 				log.Error("the job(%s) is not running", task.JobName, ctx.Data["MsgID"])
- 				resultCode = "-1"
- 				errorMsg = "the job is not running"
- 				break
- 			}
-
- 			if !ctx.IsSigned || (ctx.User.ID != task.UserID && !ctx.IsUserSiteAdmin() && !ctx.IsUserRepoOwner()) {
- 				log.Error("the user has no right to stop the job(%s)", task.JobName, ctx.Data["MsgID"])
- 				resultCode = "-1"
- 				errorMsg = "you have no right to stop the job"
- 				break
- 			}
- 		} else if action == models.ActionRestart {
- 			// CheckWechatBind may answer the request itself.
- 			ctx.CheckWechatBind()
- 			if ctx.Written() {
- 				return
- 			}
- 			if task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsCreateFailed) {
- 				log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"])
- 				resultCode = "-1"
- 				errorMsg = "the job is not stopped"
- 				break
- 			}
-
- 			if !ctx.IsSigned || (ctx.User.ID != task.UserID && !ctx.IsUserSiteAdmin()) {
- 				log.Error("the user has no right to restart the job(%s)", task.JobName, ctx.Data["MsgID"])
- 				resultCode = "-1"
- 				errorMsg = "you have no right to restart the job"
- 				break
- 			}
-
- 			count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID)
- 			if err != nil {
- 				log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
- 				resultCode = "-1"
- 				errorMsg = "system error"
- 				break
- 			}
- 			if count >= 1 {
- 				log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
- 				resultCode = "-1"
- 				errorMsg = "you have already a running or waiting task, can not create more"
- 				break
- 			}
-
- 			// A restart is sent to ModelArts as a start action.
- 			action = models.ActionStart
- 		} else {
- 			log.Error("the action(%s) is illegal", action, ctx.Data["MsgID"])
- 			resultCode = "-1"
- 			errorMsg = "非法操作"
- 			break
- 		}
-
- 		param := models.NotebookAction{
- 			Action: action,
- 		}
- 		createTime := timeutil.TimeStampNow()
- 		res, err := modelarts.ManageNotebook2(task.JobID, param)
- 		if err != nil {
- 			log.Error("ManageNotebook2(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
- 			resultCode = "-1"
- 			errorMsg = err.Error()
- 			if strings.Contains(err.Error(), modelarts.NotebookNotFound) {
- 				errorMsg = "the job's version is too old and can not be restarted"
- 			}
- 			break
- 		}
-
- 		status = res.Status
- 		if action == models.ActionStart {
- 			// A restart is recorded as a new task that copies the old one.
- 			newTask := &models.Cloudbrain{
- 				Status:          status,
- 				UserID:          task.UserID,
- 				RepoID:          task.RepoID,
- 				JobID:           task.JobID,
- 				JobName:         task.JobName,
- 				DisplayJobName:  task.DisplayJobName,
- 				JobType:         task.JobType,
- 				Type:            task.Type,
- 				Uuid:            task.Uuid,
- 				Image:           task.Image,
- 				ComputeResource: task.ComputeResource,
- 				Description:     task.Description,
- 				CreatedUnix:     createTime,
- 				UpdatedUnix:     createTime,
- 			}
-
- 			err = models.RestartCloudbrain(task, newTask)
- 			if err != nil {
- 				log.Error("RestartCloudbrain(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
- 				resultCode = "-1"
- 				errorMsg = "system error"
- 				break
- 			}
- 			ID = strconv.FormatInt(newTask.ID, 10)
- 			notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, ID, task.DisplayJobName, models.ActionCreateDebugNPUTask)
- 		} else {
- 			task.Status = res.Status
- 			if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
- 				task.EndTime = timeutil.TimeStampNow()
- 			}
- 			task.ComputeAndSetDuration()
- 			err = models.UpdateJob(task)
- 			if err != nil {
- 				log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
- 				resultCode = "-1"
- 				errorMsg = "system error"
- 				break
- 			}
- 		}
-
- 		break
- 	}
-
- 	ctx.JSON(200, map[string]string{
- 		"result_code": resultCode,
- 		"error_msg":   errorMsg,
- 		"status":      status,
- 		"id":          ID,
- 	})
- }
-
- func NotebookDel(ctx *context.Context) {
- var listType = ctx.Query("debugListType")
- task := ctx.Cloudbrain
-
- if task.Status != string(models.ModelArtsCreateFailed) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsStopped) {
- log.Error("the job(%s) has not been stopped", task.JobName)
- ctx.RenderWithErr("the job has not been stopped", tplDebugJobIndex, nil)
- return
- }
-
- _, err := modelarts.DelNotebook2(task.JobID)
- if err != nil {
- log.Error("DelNotebook2(%s) failed:%v", task.JobName, err.Error())
- if strings.Contains(err.Error(), modelarts.NotebookNotFound) || strings.Contains(err.Error(), modelarts.NotebookNoPermission) || strings.Contains(err.Error(), modelarts.NotebookInvalid) {
- log.Info("old notebook version")
- } else {
- ctx.RenderWithErr(err.Error(), tplDebugJobIndex, nil)
- return
- }
- }
-
- err = models.DeleteJob(task)
- if err != nil {
- ctx.RenderWithErr(err.Error(), tplDebugJobIndex, nil)
- return
- }
-
- var isAdminPage = ctx.Query("isadminpage")
- if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
- ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
- } else {
- ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=" + listType)
- }
- }
-
- func TrainJobIndex(ctx *context.Context) {
- MustEnableModelArts(ctx)
-
- repo := ctx.Repo.Repository
- page := ctx.QueryInt("page")
- if page <= 0 {
- page = 1
- }
-
- listType := ctx.Query("listType")
- if len(listType) == 0 {
- listType = models.AllResource
- }
- ctx.Data["ListType"] = listType
-
- typeCloudBrain := models.TypeCloudBrainAll
- if listType == models.GPUResource {
- typeCloudBrain = models.TypeCloudBrainOne
- } else if listType == models.NPUResource {
- typeCloudBrain = models.TypeCloudBrainTwo
- } else if listType == models.AllResource {
- typeCloudBrain = models.TypeCloudBrainAll
- }
- //else {
- // log.Error("listType(%s) error", listType)
- // ctx.ServerError("listType error", errors.New("listType error"))
- // return
- //}
-
- var jobTypes []string
- jobTypes = append(jobTypes, string(models.JobTypeTrain))
- tasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
- ListOptions: models.ListOptions{
- Page: page,
- PageSize: setting.UI.IssuePagingNum,
- },
- RepoID: repo.ID,
- Type: typeCloudBrain,
- JobTypeNot: false,
- JobTypes: jobTypes,
- IsLatestVersion: modelarts.IsLatestVersion,
- })
- if err != nil {
- ctx.ServerError("Cloudbrain", err)
- return
- }
-
- for i, task := range tasks {
- tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
- tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
- if task.Cloudbrain.Type == models.TypeCloudBrainOne {
- tasks[i].ComputeResource = models.GPUResource
- } else if task.Cloudbrain.Type == models.TypeCloudBrainTwo {
- tasks[i].ComputeResource = models.NPUResource
- }
- }
-
- pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
- pager.SetDefaultParams(ctx)
- pager.AddParam(ctx, "listType", "ListType")
- ctx.Data["Page"] = pager
-
- ctx.Data["PageIsCloudBrain"] = true
- ctx.Data["Tasks"] = tasks
- ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx)
- ctx.Data["RepoIsEmpty"] = repo.IsEmpty
- ctx.HTML(200, tplModelArtsTrainJobIndex)
- }
-
- // TrainJobNew renders the train-job creation form.
- func TrainJobNew(ctx *context.Context) {
- 	if err := trainJobNewDataPrepare(ctx); err != nil {
- 		// The helper normally reports its own failures through
- 		// ctx.ServerError; only answer here if nothing was written yet, so
- 		// the error response is not produced twice.
- 		if !ctx.Written() {
- 			ctx.ServerError("get new train-job info failed", err)
- 		}
- 		return
- 	}
- 	ctx.HTML(200, tplModelArtsTrainJobNew)
- }
-
- // trainJobNewDataPrepare fills ctx.Data with everything the train-job creation
- // form needs: a generated display job name, the user's train attachments, the
- // resource pools, engines, engine versions and flavors decoded from the
- // settings, the current branch name and the custom parameter configurations.
- //
- // On any failure the server error response is written and the error returned.
- func trainJobNewDataPrepare(ctx *context.Context) error {
- 	ctx.Data["PageIsCloudBrain"] = true
-
- 	now := time.Now()
- 	ctx.Data["display_job_name"] = cutString(ctx.User.Name, 5) + now.Format("2006010215") + strconv.Itoa(int(now.Unix()))[5:]
-
- 	attachments, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
- 	if err != nil {
- 		ctx.ServerError("GetAllUserAttachments failed:", err)
- 		return err
- 	}
- 	ctx.Data["attachments"] = attachments
-
- 	var pools modelarts.ResourcePool
- 	if decodeErr := json.Unmarshal([]byte(setting.ResourcePools), &pools); decodeErr != nil {
- 		ctx.ServerError("json.Unmarshal failed:", decodeErr)
- 		return decodeErr
- 	}
- 	ctx.Data["resource_pools"] = pools.Info
-
- 	var engineList modelarts.Engine
- 	if decodeErr := json.Unmarshal([]byte(setting.Engines), &engineList); decodeErr != nil {
- 		ctx.ServerError("json.Unmarshal failed:", decodeErr)
- 		return decodeErr
- 	}
- 	ctx.Data["engines"] = engineList.Info
-
- 	var engineVersions modelarts.VersionInfo
- 	if decodeErr := json.Unmarshal([]byte(setting.EngineVersions), &engineVersions); decodeErr != nil {
- 		ctx.ServerError("json.Unmarshal failed:", decodeErr)
- 		return decodeErr
- 	}
- 	ctx.Data["engine_versions"] = engineVersions.Version
-
- 	var flavors modelarts.Flavor
- 	if decodeErr := json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavors); decodeErr != nil {
- 		ctx.ServerError("json.Unmarshal failed:", decodeErr)
- 		return decodeErr
- 	}
- 	ctx.Data["flavor_infos"] = flavors.Info
-
- 	ctx.Data["params"] = ""
- 	ctx.Data["branchName"] = ctx.Repo.BranchName
-
- 	configs, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
- 	if err != nil {
- 		ctx.ServerError("getConfigList failed:", err)
- 		return err
- 	}
- 	ctx.Data["config_list"] = configs.ParaConfigs
- 	ctx.Data["cloudbraintype"] = models.TypeCloudBrainTwo
-
- 	return nil
- }
-
- // trainJobErrorNewDataPrepare fills ctx.Data for re-rendering the train-job
- // creation form after a failed submission. Besides the data of a fresh form
- // (display job name, attachments, resource pools, engines, engine versions,
- // flavors, parameter configurations) it restores the submitted run
- // parameters, boot file, attachment UUID and branch name from form.
- //
- // On any failure the server error response is written and the error returned.
- func trainJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) error {
- 	ctx.Data["PageIsCloudBrain"] = true
-
- 	now := time.Now()
- 	ctx.Data["display_job_name"] = cutString(ctx.User.Name, 5) + now.Format("2006010215") + strconv.Itoa(int(now.Unix()))[5:]
-
- 	attachments, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
- 	if err != nil {
- 		ctx.ServerError("GetAllUserAttachments failed:", err)
- 		return err
- 	}
- 	ctx.Data["attachments"] = attachments
-
- 	var pools modelarts.ResourcePool
- 	if decodeErr := json.Unmarshal([]byte(setting.ResourcePools), &pools); decodeErr != nil {
- 		ctx.ServerError("json.Unmarshal failed:", decodeErr)
- 		return decodeErr
- 	}
- 	ctx.Data["resource_pools"] = pools.Info
-
- 	var engineList modelarts.Engine
- 	if decodeErr := json.Unmarshal([]byte(setting.Engines), &engineList); decodeErr != nil {
- 		ctx.ServerError("json.Unmarshal failed:", decodeErr)
- 		return decodeErr
- 	}
- 	ctx.Data["engines"] = engineList.Info
-
- 	var engineVersions modelarts.VersionInfo
- 	if decodeErr := json.Unmarshal([]byte(setting.EngineVersions), &engineVersions); decodeErr != nil {
- 		ctx.ServerError("json.Unmarshal failed:", decodeErr)
- 		return decodeErr
- 	}
- 	ctx.Data["engine_versions"] = engineVersions.Version
-
- 	var flavors modelarts.Flavor
- 	if decodeErr := json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavors); decodeErr != nil {
- 		ctx.ServerError("json.Unmarshal failed:", decodeErr)
- 		return decodeErr
- 	}
- 	ctx.Data["flavor_infos"] = flavors.Info
-
- 	configs, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
- 	if err != nil {
- 		ctx.ServerError("getConfigList failed:", err)
- 		return err
- 	}
-
- 	// Restore what the user submitted.
- 	var submitted modelarts.Parameters
- 	if decodeErr := json.Unmarshal([]byte(form.Params), &submitted); decodeErr != nil {
- 		ctx.ServerError("json.Unmarshal failed:", decodeErr)
- 		return decodeErr
- 	}
- 	ctx.Data["params"] = submitted.Parameter
- 	ctx.Data["config_list"] = configs.ParaConfigs
- 	ctx.Data["bootFile"] = form.BootFile
- 	ctx.Data["uuid"] = form.Attachment
- 	ctx.Data["branch_name"] = form.BranchName
-
- 	return nil
- }
-
- // TrainJobNewVersion renders the form for creating a new version of an
- // existing train job.
- func TrainJobNewVersion(ctx *context.Context) {
- 	if err := trainJobNewVersionDataPrepare(ctx); err != nil {
- 		// Most failure paths of the helper already wrote the error response;
- 		// only the ones that merely return the error are answered here, so
- 		// the response is never produced twice.
- 		if !ctx.Written() {
- 			ctx.ServerError("get new train-job info failed", err)
- 		}
- 		return
- 	}
- 	ctx.HTML(200, tplModelArtsTrainJobVersionNew)
- }
-
- // trainJobNewVersionDataPrepare fills ctx.Data for the "new version" form of
- // the train job identified by the ":jobid" route parameter and the
- // "version_name" query parameter. The form is pre-populated from that stored
- // task (names, parameters, branch, boot file, dataset, flavor, engine, ...)
- // together with the selectable attachments, resource pools, engines, engine
- // versions, flavors, repository branches and parameter configurations.
- //
- // A failed task lookup is only logged and returned; every other failure also
- // writes the server error response before the error is returned.
- func trainJobNewVersionDataPrepare(ctx *context.Context) error {
- 	ctx.Data["PageIsCloudBrain"] = true
-
- 	jobID := ctx.Params(":jobid")
- 	versionName := ctx.Query("version_name")
-
- 	task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
- 	if err != nil {
- 		log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
- 		return err
- 	}
- 	ctx.Data["display_job_name"] = task.DisplayJobName
- 	ctx.Data["job_name"] = task.JobName
-
- 	attachments, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
- 	if err != nil {
- 		ctx.ServerError("GetAllUserAttachments failed:", err)
- 		return err
- 	}
- 	ctx.Data["attachments"] = attachments
-
- 	var pools modelarts.ResourcePool
- 	if decodeErr := json.Unmarshal([]byte(setting.ResourcePools), &pools); decodeErr != nil {
- 		ctx.ServerError("json.Unmarshal failed:", decodeErr)
- 		return decodeErr
- 	}
- 	ctx.Data["resource_pools"] = pools.Info
-
- 	var engineList modelarts.Engine
- 	if decodeErr := json.Unmarshal([]byte(setting.Engines), &engineList); decodeErr != nil {
- 		ctx.ServerError("json.Unmarshal failed:", decodeErr)
- 		return decodeErr
- 	}
- 	ctx.Data["engines"] = engineList.Info
-
- 	var engineVersions modelarts.VersionInfo
- 	if decodeErr := json.Unmarshal([]byte(setting.EngineVersions), &engineVersions); decodeErr != nil {
- 		ctx.ServerError("json.Unmarshal failed:", decodeErr)
- 		return decodeErr
- 	}
- 	ctx.Data["engine_versions"] = engineVersions.Version
-
- 	var flavors modelarts.Flavor
- 	if decodeErr := json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavors); decodeErr != nil {
- 		ctx.ServerError("json.Unmarshal failed:", decodeErr)
- 		return decodeErr
- 	}
- 	ctx.Data["flavor_infos"] = flavors.Info
-
- 	// Run parameters of the version the new one is based on.
- 	var stored modelarts.Parameters
- 	if decodeErr := json.Unmarshal([]byte(task.Parameters), &stored); decodeErr != nil {
- 		ctx.ServerError("json.Unmarshal failed:", decodeErr)
- 		return decodeErr
- 	}
- 	ctx.Data["params"] = stored.Parameter
-
- 	branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
- 	if err != nil {
- 		ctx.ServerError("GetBranches error:", err)
- 		return err
- 	}
- 	ctx.Data["branches"] = branches
-
- 	ctx.Data["branch_name"] = task.BranchName
- 	ctx.Data["description"] = task.Description
- 	ctx.Data["boot_file"] = task.BootFile
- 	ctx.Data["dataset_name"] = task.DatasetName
- 	ctx.Data["work_server_number"] = task.WorkServerNumber
- 	ctx.Data["flavor_name"] = task.FlavorName
- 	ctx.Data["engine_name"] = task.EngineName
- 	ctx.Data["uuid"] = task.Uuid
- 	ctx.Data["flavor_code"] = task.FlavorCode
- 	ctx.Data["engine_id"] = task.EngineID
- 	ctx.Data["cloudbraintype"] = models.TypeCloudBrainTwo
-
- 	configs, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
- 	if err != nil {
- 		ctx.ServerError("getConfigList failed:", err)
- 		return err
- 	}
- 	ctx.Data["config_list"] = configs.ParaConfigs
-
- 	return nil
- }
-
- // versionErrorDataPrepare fills ctx.Data for re-rendering the "new version"
- // form after a failed submission. The base task is looked up by the ":jobid"
- // route parameter and the "version_name" query parameter; selectable options
- // come from the settings, and the submitted values are restored from form.
- //
- // A failed task lookup is only logged and returned; every other failure also
- // writes the server error response before the error is returned.
- func versionErrorDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) error {
- 	ctx.Data["PageIsCloudBrain"] = true
-
- 	jobID := ctx.Params(":jobid")
- 	versionName := ctx.Query("version_name")
-
- 	task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
- 	if err != nil {
- 		log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
- 		return err
- 	}
-
- 	// NOTE(review): the freshly generated name is only used for "train_url"
- 	// below, while "job_name" shows the stored task's name - confirm this
- 	// asymmetry is intended.
- 	now := time.Now()
- 	generatedJobName := cutString(ctx.User.Name, 5) + now.Format("2006010215") + strconv.Itoa(int(now.Unix()))[5:]
- 	ctx.Data["job_name"] = task.JobName
-
- 	attachments, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
- 	if err != nil {
- 		ctx.ServerError("GetAllUserAttachments failed:", err)
- 		return err
- 	}
- 	ctx.Data["attachments"] = attachments
-
- 	var pools modelarts.ResourcePool
- 	if decodeErr := json.Unmarshal([]byte(setting.ResourcePools), &pools); decodeErr != nil {
- 		ctx.ServerError("json.Unmarshal failed:", decodeErr)
- 		return decodeErr
- 	}
- 	ctx.Data["resource_pools"] = pools.Info
-
- 	var engineList modelarts.Engine
- 	if decodeErr := json.Unmarshal([]byte(setting.Engines), &engineList); decodeErr != nil {
- 		ctx.ServerError("json.Unmarshal failed:", decodeErr)
- 		return decodeErr
- 	}
- 	ctx.Data["engines"] = engineList.Info
-
- 	var engineVersions modelarts.VersionInfo
- 	if decodeErr := json.Unmarshal([]byte(setting.EngineVersions), &engineVersions); decodeErr != nil {
- 		ctx.ServerError("json.Unmarshal failed:", decodeErr)
- 		return decodeErr
- 	}
- 	ctx.Data["engine_versions"] = engineVersions.Version
-
- 	var flavors modelarts.Flavor
- 	if decodeErr := json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavors); decodeErr != nil {
- 		ctx.ServerError("json.Unmarshal failed:", decodeErr)
- 		return decodeErr
- 	}
- 	ctx.Data["flavor_infos"] = flavors.Info
-
- 	// Restore the run parameters the user submitted.
- 	var submitted modelarts.Parameters
- 	if decodeErr := json.Unmarshal([]byte(form.Params), &submitted); decodeErr != nil {
- 		ctx.ServerError("json.Unmarshal failed:", decodeErr)
- 		return decodeErr
- 	}
- 	ctx.Data["params"] = submitted.Parameter
-
- 	ctx.Data["train_url"] = "/" + setting.Bucket + modelarts.JobPath + generatedJobName + modelarts.OutputPath
-
- 	branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
- 	if err != nil {
- 		ctx.ServerError("GetBranches error:", err)
- 		return err
- 	}
- 	ctx.Data["branches"] = branches
-
- 	ctx.Data["description"] = form.Description
- 	ctx.Data["dataset_name"] = task.DatasetName
- 	ctx.Data["work_server_number"] = form.WorkServerNumber
- 	ctx.Data["flavor_name"] = form.FlavorName
- 	ctx.Data["engine_name"] = form.EngineName
- 	ctx.Data["flavor_code"] = task.FlavorCode
- 	ctx.Data["engine_id"] = task.EngineID
- 	ctx.Data["version_name"] = form.VersionName
- 	ctx.Data["bootFile"] = form.BootFile
- 	ctx.Data["uuid"] = form.Attachment
- 	ctx.Data["branch_name"] = form.BranchName
-
- 	configs, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
- 	if err != nil {
- 		ctx.ServerError("getConfigList failed:", err)
- 		return err
- 	}
- 	ctx.Data["config_list"] = configs.ParaConfigs
-
- 	return nil
- }
-
- func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
- ctx.Data["PageIsTrainJob"] = true
- VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
- displayJobName := form.DisplayJobName
- jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
- uuid := form.Attachment
- description := form.Description
- workServerNumber := form.WorkServerNumber
- engineID := form.EngineID
- bootFile := form.BootFile
- flavorCode := form.Flavor
- params := form.Params
- poolID := form.PoolID
- isSaveParam := form.IsSaveParam
- repo := ctx.Repo.Repository
- codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
- codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
- outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/"
- logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
- dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
- branch_name := form.BranchName
- isLatestVersion := modelarts.IsLatestVersion
- FlavorName := form.FlavorName
- VersionCount := modelarts.VersionCount
- EngineName := form.EngineName
-
- count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID)
- if err != nil {
- log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form)
- return
- } else {
- if count >= 1 {
- log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobNew, &form)
- return
- }
- }
-
- if err := paramCheckCreateTrainJob(form); err != nil {
- log.Error("paramCheckCreateTrainJob failed:(%v)", err)
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
- return
- }
- //Determine whether the task name of the task in the project is duplicated
- tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName)
- if err == nil {
- if len(tasks) != 0 {
- log.Error("the job name did already exist", ctx.Data["MsgID"])
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("the job name did already exist", tplModelArtsTrainJobNew, &form)
- return
- }
- } else {
- if !models.IsErrJobNotExist(err) {
- log.Error("system error, %v", err, ctx.Data["MsgID"])
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("system error", tplModelArtsTrainJobNew, &form)
- return
- }
- }
-
- //todo: del the codeLocalPath
- _, err = ioutil.ReadDir(codeLocalPath)
- if err == nil {
- os.RemoveAll(codeLocalPath)
- }
-
- gitRepo, _ := git.OpenRepository(repo.RepoPath())
- commitID, _ := gitRepo.GetBranchCommitID(branch_name)
-
- if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{
- Branch: branch_name,
- }); err != nil {
- log.Error("Create task failed, server timed out: %s (%v)", repo.FullName(), err)
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("Create task failed, server timed out", tplModelArtsTrainJobNew, &form)
- return
- }
-
- //todo: upload code (send to file_server todo this work?)
- if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
- log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form)
- return
- }
-
- if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
- log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form)
- return
- }
-
- // parentDir := VersionOutputPath + "/"
- if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
- // if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil {
- log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form)
- return
- }
-
- var parameters models.Parameters
- param := make([]models.Parameter, 0)
- existDeviceTarget := false
- if len(params) != 0 {
- err := json.Unmarshal([]byte(params), ¶meters)
- if err != nil {
- log.Error("Failed to Unmarshal params: %s (%v)", params, err)
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobNew, &form)
- return
- }
-
- for _, parameter := range parameters.Parameter {
- if parameter.Label == modelarts.DeviceTarget {
- existDeviceTarget = true
- }
- if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
- param = append(param, models.Parameter{
- Label: parameter.Label,
- Value: parameter.Value,
- })
- }
- }
- }
- if !existDeviceTarget {
- param = append(param, models.Parameter{
- Label: modelarts.DeviceTarget,
- Value: modelarts.Ascend,
- })
- }
-
- //save param config
- if isSaveParam == "on" {
- saveparams := append(param, models.Parameter{
- Label: modelarts.TrainUrl,
- Value: outputObsPath,
- }, models.Parameter{
- Label: modelarts.DataUrl,
- Value: dataPath,
- })
- if form.ParameterTemplateName == "" {
- log.Error("ParameterTemplateName is empty")
- trainJobNewDataPrepare(ctx)
- ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form)
- return
- }
-
- _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
- ConfigName: form.ParameterTemplateName,
- Description: form.PrameterDescription,
- DataUrl: dataPath,
- AppUrl: codeObsPath,
- BootFileUrl: codeObsPath + bootFile,
- TrainUrl: outputObsPath,
- Flavor: models.Flavor{
- Code: flavorCode,
- },
- WorkServerNum: workServerNumber,
- EngineID: int64(engineID),
- LogUrl: logObsPath,
- PoolID: poolID,
- Parameter: saveparams,
- })
-
- if err != nil {
- log.Error("Failed to CreateTrainJobConfig: %v", err)
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form)
- return
- }
- }
-
- req := &modelarts.GenerateTrainJobReq{
- JobName: jobName,
- DisplayJobName: displayJobName,
- DataUrl: dataPath,
- Description: description,
- CodeObsPath: codeObsPath,
- BootFileUrl: codeObsPath + bootFile,
- BootFile: bootFile,
- TrainUrl: outputObsPath,
- FlavorCode: flavorCode,
- WorkServerNumber: workServerNumber,
- EngineID: int64(engineID),
- LogUrl: logObsPath,
- PoolID: poolID,
- Uuid: uuid,
- Parameters: param,
- CommitID: commitID,
- IsLatestVersion: isLatestVersion,
- BranchName: branch_name,
- Params: form.Params,
- FlavorName: FlavorName,
- EngineName: EngineName,
- VersionCount: VersionCount,
- TotalVersionCount: modelarts.TotalVersionCount,
- }
-
- //将params转换Parameters.Parameter,出错时返回给前端
- var Parameters modelarts.Parameters
- if err := json.Unmarshal([]byte(params), &Parameters); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return
- }
-
- err = modelarts.GenerateTrainJob(ctx, req)
- if err != nil {
- log.Error("GenerateTrainJob failed:%v", err.Error())
- trainJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
- return
- }
- ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
- }
-
- func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
- ctx.Data["PageIsTrainJob"] = true
- var jobID = ctx.Params(":jobid")
-
- count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID)
- if err != nil {
- log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
- versionErrorDataPrepare(ctx, form)
- ctx.RenderWithErr("system error", tplModelArtsTrainJobVersionNew, &form)
- return
- } else {
- if count >= 1 {
- log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
- versionErrorDataPrepare(ctx, form)
- ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplModelArtsTrainJobVersionNew, &form)
- return
- }
- }
-
- latestTask, err := models.GetCloudbrainByJobIDAndIsLatestVersion(jobID, modelarts.IsLatestVersion)
- if err != nil {
- ctx.ServerError("GetCloudbrainByJobIDAndIsLatestVersion faild:", err)
- return
- }
- VersionOutputPath := modelarts.GetOutputPathByCount(latestTask.TotalVersionCount + 1)
-
- displayJobName := form.DisplayJobName
- jobName := form.JobName
- uuid := form.Attachment
- description := form.Description
- workServerNumber := form.WorkServerNumber
- engineID := form.EngineID
- bootFile := form.BootFile
- flavorCode := form.Flavor
- params := form.Params
- poolID := form.PoolID
- isSaveParam := form.IsSaveParam
- repo := ctx.Repo.Repository
- codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
- codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath + VersionOutputPath + "/"
- outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/"
- logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
- dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
- branch_name := form.BranchName
- PreVersionName := form.VersionName
- FlavorName := form.FlavorName
- EngineName := form.EngineName
- isLatestVersion := modelarts.IsLatestVersion
-
- canNewJob, _ := canUserCreateTrainJobVersion(ctx, latestTask.UserID)
- if !canNewJob {
- ctx.RenderWithErr("user cann't new trainjob", tplModelArtsTrainJobVersionNew, &form)
- return
- }
-
- if err := paramCheckCreateTrainJob(form); err != nil {
- log.Error("paramCheckCreateTrainJob failed:(%v)", err)
- versionErrorDataPrepare(ctx, form)
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
- return
- }
-
- //todo: del the codeLocalPath
- _, err = ioutil.ReadDir(codeLocalPath)
- if err == nil {
- os.RemoveAll(codeLocalPath)
- }
-
- gitRepo, _ := git.OpenRepository(repo.RepoPath())
- commitID, _ := gitRepo.GetBranchCommitID(branch_name)
- if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{
- Branch: branch_name,
- }); err != nil {
- log.Error("Failed git clone repo to local(!: %s (%v)", repo.FullName(), err)
- versionErrorDataPrepare(ctx, form)
- ctx.RenderWithErr("Failed git clone repo to local!", tplModelArtsTrainJobVersionNew, &form)
- return
- }
-
- //todo: upload code (send to file_server todo this work?)
- if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
- log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
- versionErrorDataPrepare(ctx, form)
- ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobVersionNew, &form)
- return
- }
-
- if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
- log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
- versionErrorDataPrepare(ctx, form)
- ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobVersionNew, &form)
- return
- }
-
- parentDir := VersionOutputPath + "/"
- // parentDir := ""
- // if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
- if err := uploadCodeToObs(codeLocalPath, jobName, parentDir); err != nil {
- log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
- versionErrorDataPrepare(ctx, form)
- ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobVersionNew, &form)
- return
- }
-
- //todo: del local code?
-
- var parameters models.Parameters
- param := make([]models.Parameter, 0)
- existDeviceTarget := true
- if len(params) != 0 {
- err := json.Unmarshal([]byte(params), ¶meters)
- if err != nil {
- log.Error("Failed to Unmarshal params: %s (%v)", params, err)
- versionErrorDataPrepare(ctx, form)
- ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobVersionNew, &form)
- return
- }
- for _, parameter := range parameters.Parameter {
- if parameter.Label == modelarts.DeviceTarget {
- existDeviceTarget = true
- }
- if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
- param = append(param, models.Parameter{
- Label: parameter.Label,
- Value: parameter.Value,
- })
- }
- }
- }
- if !existDeviceTarget {
- param = append(param, models.Parameter{
- Label: modelarts.DeviceTarget,
- Value: modelarts.Ascend,
- })
- }
-
- //save param config
- if isSaveParam == "on" {
- saveparams := append(param, models.Parameter{
- Label: modelarts.TrainUrl,
- Value: outputObsPath,
- }, models.Parameter{
- Label: modelarts.DataUrl,
- Value: dataPath,
- })
- if form.ParameterTemplateName == "" {
- log.Error("ParameterTemplateName is empty")
- versionErrorDataPrepare(ctx, form)
- ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobVersionNew, &form)
- return
- }
-
- _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
- ConfigName: form.ParameterTemplateName,
- Description: form.PrameterDescription,
- DataUrl: dataPath,
- AppUrl: codeObsPath,
- BootFileUrl: codeObsPath + bootFile,
- TrainUrl: outputObsPath,
- Flavor: models.Flavor{
- Code: flavorCode,
- },
- WorkServerNum: workServerNumber,
- EngineID: int64(engineID),
- LogUrl: logObsPath,
- PoolID: poolID,
- Parameter: saveparams,
- })
-
- if err != nil {
- log.Error("Failed to CreateTrainJobConfig: %v", err)
- versionErrorDataPrepare(ctx, form)
- ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobVersionNew, &form)
- return
- }
- }
-
- task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, PreVersionName)
- if err != nil {
- log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
- return
- }
- req := &modelarts.GenerateTrainJobReq{
- JobName: jobName,
- DisplayJobName: displayJobName,
- DataUrl: dataPath,
- Description: description,
- CodeObsPath: codeObsPath,
- BootFileUrl: codeObsPath + bootFile,
- BootFile: bootFile,
- TrainUrl: outputObsPath,
- FlavorCode: flavorCode,
- WorkServerNumber: workServerNumber,
- IsLatestVersion: isLatestVersion,
- EngineID: int64(engineID),
- LogUrl: logObsPath,
- PoolID: poolID,
- Uuid: uuid,
- Params: form.Params,
- Parameters: param,
- PreVersionId: task.VersionID,
- CommitID: commitID,
- BranchName: branch_name,
- FlavorName: FlavorName,
- EngineName: EngineName,
- PreVersionName: PreVersionName,
- TotalVersionCount: latestTask.TotalVersionCount + 1,
- }
-
- err = modelarts.GenerateTrainJobVersion(ctx, req, jobID)
- if err != nil {
- log.Error("GenerateTrainJob failed:%v", err.Error())
- versionErrorDataPrepare(ctx, form)
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
- return
- }
- ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job/" + jobID)
- // ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
- }
-
// readDir reads the directory named by dirname and returns all of its
// entries in the order the operating system reports them (the list is not
// sorted). An empty directory yields a nil slice and a nil error; any other
// failure is returned as the error.
func readDir(dirname string) ([]os.FileInfo, error) {
	f, err := os.Open(dirname)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	// Read in fixed-size batches until io.EOF so that directories holding
	// more entries than a single batch are still returned completely.
	const batchSize = 100
	var list []os.FileInfo
	for {
		batch, err := f.Readdir(batchSize)
		list = append(list, batch...)
		if err == io.EOF {
			//todo: can not upload empty folder
			return list, nil
		}
		if err != nil {
			return nil, err
		}
	}
}
-
- func uploadCodeToObs(codePath, jobName, parentDir string) error {
- files, err := readDir(codePath)
- if err != nil {
- log.Error("readDir(%s) failed: %s", codePath, err.Error())
- return err
- }
-
- for _, file := range files {
- if file.IsDir() {
- input := &obs.PutObjectInput{}
- input.Bucket = setting.Bucket
- input.Key = parentDir + file.Name() + "/"
- _, err = storage.ObsCli.PutObject(input)
- if err != nil {
- log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
- return err
- }
-
- if err = uploadCodeToObs(codePath+file.Name()+"/", jobName, parentDir+file.Name()+"/"); err != nil {
- log.Error("uploadCodeToObs(%s) failed: %s", file.Name(), err.Error())
- return err
- }
- } else {
- input := &obs.PutFileInput{}
- input.Bucket = setting.Bucket
- input.Key = setting.CodePathPrefix + jobName + "/code/" + parentDir + file.Name()
- input.SourceFile = codePath + file.Name()
- _, err = storage.ObsCli.PutFile(input)
- if err != nil {
- log.Error("PutFile(%s) failed: %s", input.SourceFile, err.Error())
- return err
- }
- }
- }
-
- return nil
- }
-
- func obsMkdir(dir string) error {
- input := &obs.PutObjectInput{}
- input.Bucket = setting.Bucket
- input.Key = dir
- _, err := storage.ObsCli.PutObject(input)
- if err != nil {
- log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
- return err
- }
-
- return nil
- }
-
- func paramCheckCreateTrainJob(form auth.CreateModelArtsTrainJobForm) error {
- if !strings.HasSuffix(form.BootFile, ".py") {
- log.Error("the boot file(%s) must be a python file", form.BootFile)
- return errors.New("启动文件必须是python文件")
- }
-
- if form.WorkServerNumber > 25 || form.WorkServerNumber < 1 {
- log.Error("the WorkServerNumber(%d) must be in (1,25)", form.WorkServerNumber)
- return errors.New("计算节点数必须在1-25之间")
- }
- if form.BranchName == "" {
- log.Error("the branch must not be null!", form.BranchName)
- return errors.New("代码分支不能为空!")
- }
-
- return nil
- }
-
- func paramCheckCreateInferenceJob(form auth.CreateModelArtsInferenceJobForm) error {
- if !strings.HasSuffix(form.BootFile, ".py") {
- log.Error("the boot file(%s) must be a python file", form.BootFile)
- return errors.New("启动文件必须是python文件")
- }
-
- if form.WorkServerNumber > 25 || form.WorkServerNumber < 1 {
- log.Error("the WorkServerNumber(%d) must be in (1,25)", form.WorkServerNumber)
- return errors.New("计算节点数必须在1-25之间")
- }
-
- if form.ModelName == "" {
- log.Error("the ModelName(%d) must not be nil", form.ModelName)
- return errors.New("模型名称不能为空")
- }
- if form.ModelVersion == "" {
- log.Error("the ModelVersion(%d) must not be nil", form.ModelVersion)
- return errors.New("模型版本不能为空")
- }
- if form.CkptName == "" {
- log.Error("the CkptName(%d) must not be nil", form.CkptName)
- return errors.New("权重文件不能为空")
- }
- if form.BranchName == "" {
- log.Error("the Branch(%d) must not be nil", form.BranchName)
- return errors.New("分支名不能为空")
- }
-
- if utf8.RuneCountInString(form.Description) > 255 {
- log.Error("the Description length(%d) must not more than 255", form.Description)
- return errors.New("描述字符不能超过255个字符")
- }
-
- return nil
- }
-
- func TrainJobShow(ctx *context.Context) {
- ctx.Data["PageIsCloudBrain"] = true
- var jobID = ctx.Params(":jobid")
-
- repo := ctx.Repo.Repository
- page := ctx.QueryInt("page")
- if page <= 0 {
- page = 1
- }
-
- var jobTypes []string
- jobTypes = append(jobTypes, string(models.JobTypeTrain))
- VersionListTasks, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
- ListOptions: models.ListOptions{
- Page: page,
- PageSize: setting.UI.IssuePagingNum,
- },
- RepoID: repo.ID,
- Type: models.TypeCloudBrainTwo,
- JobTypes: jobTypes,
- JobID: jobID,
- })
-
- if err != nil {
- log.Error("GetVersionListTasks(%s) failed:%v", jobID, err.Error())
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
- return
- }
- //设置权限
- canNewJob, err := canUserCreateTrainJobVersion(ctx, VersionListTasks[0].UserID)
- if err != nil {
- ctx.ServerError("canNewJob failed", err)
- return
- }
- ctx.Data["canNewJob"] = canNewJob
-
- //将运行参数转化为epoch_size = 3, device_target = Ascend的格式
- for i, task := range VersionListTasks {
-
- var parameters models.Parameters
-
- err := json.Unmarshal([]byte(VersionListTasks[i].Parameters), ¶meters)
- if err != nil {
- log.Error("Failed to Unmarshal Parameters: %s (%v)", VersionListTasks[i].Parameters, err)
- trainJobNewDataPrepare(ctx)
- return
- }
-
- if len(parameters.Parameter) > 0 {
- paramTemp := ""
- for _, Parameter := range parameters.Parameter {
- param := Parameter.Label + " = " + Parameter.Value + "; "
- paramTemp = paramTemp + param
- }
- VersionListTasks[i].Parameters = paramTemp[:len(paramTemp)-2]
- } else {
- VersionListTasks[i].Parameters = ""
- }
-
- VersionListTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
- VersionListTasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
- }
-
- pager := context.NewPagination(VersionListCount, setting.UI.IssuePagingNum, page, 5)
- pager.SetDefaultParams(ctx)
- ctx.Data["Page"] = pager
- ctx.Data["jobID"] = jobID
- ctx.Data["displayJobName"] = VersionListTasks[0].DisplayJobName
- ctx.Data["version_list_task"] = VersionListTasks
- ctx.Data["version_list_count"] = VersionListCount
- ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, &VersionListTasks[0].Cloudbrain)
- ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
- }
-
- func TrainJobGetLog(ctx *context.Context) {
- ctx.Data["PageIsTrainJob"] = true
-
- var jobID = ctx.Params(":jobid")
- var logFileName = ctx.Query("file_name")
- var baseLine = ctx.Query("base_line")
- var order = ctx.Query("order")
-
- if order != modelarts.OrderDesc && order != modelarts.OrderAsc {
- log.Error("order(%s) check failed", order)
- ctx.HTML(http.StatusBadRequest, tplModelArtsTrainJobShow)
- return
- }
-
- task, err := models.GetCloudbrainByJobID(jobID)
- if err != nil {
- log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
- return
- }
-
- result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines)
- if err != nil {
- log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
- return
- }
-
- ctx.Data["log"] = result
- //ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
- }
-
- func trainJobGetLog(jobID string) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) {
- task, err := models.GetCloudbrainByJobID(jobID)
- if err != nil {
- log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
- return nil, nil, err
- }
-
- resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10))
- if err != nil {
- log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error())
- return nil, nil, err
- }
-
- result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), "", resultLogFile.LogFileList[0], modelarts.OrderDesc, modelarts.Lines)
- if err != nil {
- log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
- return nil, nil, err
- }
-
- return resultLogFile, result, err
- }
-
- func TrainJobDel(ctx *context.Context) {
- var jobID = ctx.Params(":jobid")
- var listType = ctx.Query("listType")
- repo := ctx.Repo.Repository
-
- var jobTypes []string
- jobTypes = append(jobTypes, string(models.JobTypeTrain))
- VersionListTasks, _, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
- RepoID: repo.ID,
- Type: models.TypeCloudBrainTwo,
- JobTypes: jobTypes,
- JobID: jobID,
- })
- if err != nil {
- ctx.ServerError("get VersionListTasks failed", err)
- return
- }
-
- //删除modelarts上的任务记录
- _, err = modelarts.DelTrainJob(jobID)
- if err != nil {
- log.Error("DelTrainJob(%s) failed:%v", jobID, err.Error())
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
- return
- }
-
- //删除数据库Cloudbrain表的记录
- for _, task := range VersionListTasks {
- err = models.DeleteJob(&task.Cloudbrain)
- if err != nil {
- ctx.ServerError("DeleteJob failed", err)
- return
- }
- }
-
- //删除存储
- if len(VersionListTasks) > 0 {
- DeleteJobStorage(VersionListTasks[0].JobName)
- }
-
- var isAdminPage = ctx.Query("isadminpage")
- if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
- ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
- } else {
- ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType)
- }
- }
-
- func TrainJobStop(ctx *context.Context) {
- var jobID = ctx.Params(":jobid")
- var listType = ctx.Query("listType")
- task := ctx.Cloudbrain
-
- _, err := modelarts.StopTrainJob(jobID, strconv.FormatInt(task.VersionID, 10))
- if err != nil {
- log.Error("StopTrainJob(%s) failed:%v", task.JobName, err.Error())
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
- return
- }
-
- ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType)
- }
-
- func canUserCreateTrainJob(uid int64) (bool, error) {
- org, err := models.GetOrgByName(setting.AllowedOrg)
- if err != nil {
- log.Error("get allowed org failed: ", setting.AllowedOrg)
- return false, err
- }
-
- return org.IsOrgMember(uid)
- }
- func canUserCreateTrainJobVersion(ctx *context.Context, userID int64) (bool, error) {
- if ctx == nil || ctx.User == nil {
- log.Error("user unlogin!")
- return false, nil
- }
- if userID == ctx.User.ID || ctx.User.IsAdmin {
- return true, nil
- } else {
- log.Error("Only user itself and admin can new trainjob!")
- return false, nil
- }
- }
-
- func TrainJobGetConfigList(ctx *context.Context) {
- ctx.Data["PageIsTrainJob"] = true
-
- var jobID = ctx.Params(":jobid")
- var logFileName = ctx.Query("file_name")
- var baseLine = ctx.Query("base_line")
- var order = ctx.Query("order")
-
- if order != modelarts.OrderDesc && order != modelarts.OrderAsc {
- log.Error("order(%s) check failed", order)
- ctx.HTML(http.StatusBadRequest, tplModelArtsTrainJobShow)
- return
- }
-
- task, err := models.GetCloudbrainByJobID(jobID)
- if err != nil {
- log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
- return
- }
-
- result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines)
- if err != nil {
- log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
- ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
- return
- }
-
- ctx.Data["log"] = result
- //ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
- }
-
- func getConfigList(perPage, page int, sortBy, order, searchContent, configType string) (*models.GetConfigListResult, error) {
- var result models.GetConfigListResult
-
- list, err := modelarts.GetConfigList(perPage, page, sortBy, order, searchContent, configType)
- if err != nil {
- log.Error("GetConfigList failed:", err)
- return &result, err
- }
-
- for _, config := range list.ParaConfigs {
- paraConfig, err := modelarts.GetParaConfig(config.ConfigName, configType)
- if err != nil {
- log.Error("GetParaConfig failed:", err)
- return &result, err
- }
-
- config.Result = paraConfig
- }
-
- return list, nil
- }
-
- func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInferenceJobForm) {
- ctx.Data["PageIsTrainJob"] = true
- VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
- displayJobName := form.DisplayJobName
- jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
- uuid := form.Attachment
- description := form.Description
- workServerNumber := form.WorkServerNumber
- engineID := form.EngineID
- bootFile := form.BootFile
- flavorCode := form.Flavor
- params := form.Params
- poolID := form.PoolID
- repo := ctx.Repo.Repository
- codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
- codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
- resultObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.ResultPath + VersionOutputPath + "/"
- logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
- dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
- branch_name := form.BranchName
- FlavorName := form.FlavorName
- EngineName := form.EngineName
- LabelName := form.LabelName
- isLatestVersion := modelarts.IsLatestVersion
- VersionCount := modelarts.VersionCount
- trainUrl := form.TrainUrl
- modelName := form.ModelName
- modelVersion := form.ModelVersion
- ckptName := form.CkptName
-
- ckptUrl := form.TrainUrl + form.CkptName
-
- count, err := models.GetCloudbrainInferenceJobCountByUserID(ctx.User.ID)
- if err != nil {
- log.Error("GetCloudbrainInferenceJobCountByUserID failed:%v", err, ctx.Data["MsgID"])
- inferenceJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("system error", tplModelArtsInferenceJobNew, &form)
- return
- } else {
- if count >= 1 {
- log.Error("the user already has running or waiting inference task", ctx.Data["MsgID"])
- inferenceJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("you have already a running or waiting inference task, can not create more", tplModelArtsInferenceJobNew, &form)
- return
- }
- }
-
- if err := paramCheckCreateInferenceJob(form); err != nil {
- log.Error("paramCheckCreateInferenceJob failed:(%v)", err)
- inferenceJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr(err.Error(), tplModelArtsInferenceJobNew, &form)
- return
- }
-
- //Determine whether the task name of the task in the project is duplicated
- tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeInference), displayJobName)
- if err == nil {
- if len(tasks) != 0 {
- log.Error("the job name did already exist", ctx.Data["MsgID"])
- inferenceJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("the job name did already exist", tplModelArtsInferenceJobNew, &form)
- return
- }
- } else {
- if !models.IsErrJobNotExist(err) {
- log.Error("system error, %v", err, ctx.Data["MsgID"])
- inferenceJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("system error", tplModelArtsInferenceJobNew, &form)
- return
- }
- }
-
- //todo: del the codeLocalPath
- _, err = ioutil.ReadDir(codeLocalPath)
- if err == nil {
- os.RemoveAll(codeLocalPath)
- }
-
- gitRepo, _ := git.OpenRepository(repo.RepoPath())
- commitID, _ := gitRepo.GetBranchCommitID(branch_name)
-
- if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{
- Branch: branch_name,
- }); err != nil {
- log.Error("Create task failed, server timed out: %s (%v)", repo.FullName(), err)
- inferenceJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("Create task failed, server timed out", tplModelArtsInferenceJobNew, &form)
- return
- }
-
- //todo: upload code (send to file_server todo this work?)
- if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.ResultPath + VersionOutputPath + "/"); err != nil {
- log.Error("Failed to obsMkdir_result: %s (%v)", repo.FullName(), err)
- inferenceJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("Failed to obsMkdir_result", tplModelArtsInferenceJobNew, &form)
- return
- }
-
- if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
- log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
- inferenceJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsInferenceJobNew, &form)
- return
- }
-
- if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
- log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
- inferenceJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsInferenceJobNew, &form)
- return
- }
-
- var parameters models.Parameters
- param := make([]models.Parameter, 0)
- param = append(param, models.Parameter{
- Label: modelarts.ResultUrl,
- Value: "s3:/" + resultObsPath,
- }, models.Parameter{
- Label: modelarts.CkptUrl,
- Value: "s3:/" + ckptUrl,
- })
- existDeviceTarget := false
- if len(params) != 0 {
- err := json.Unmarshal([]byte(params), ¶meters)
- if err != nil {
- log.Error("Failed to Unmarshal params: %s (%v)", params, err)
- inferenceJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr("运行参数错误", tplModelArtsInferenceJobNew, &form)
- return
- }
-
- for _, parameter := range parameters.Parameter {
- if parameter.Label == modelarts.DeviceTarget {
- existDeviceTarget = true
- }
- if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
- param = append(param, models.Parameter{
- Label: parameter.Label,
- Value: parameter.Value,
- })
- }
- }
- }
- if !existDeviceTarget {
- param = append(param, models.Parameter{
- Label: modelarts.DeviceTarget,
- Value: modelarts.Ascend,
- })
- }
-
- req := &modelarts.GenerateInferenceJobReq{
- JobName: jobName,
- DisplayJobName: displayJobName,
- DataUrl: dataPath,
- Description: description,
- CodeObsPath: codeObsPath,
- BootFileUrl: codeObsPath + bootFile,
- BootFile: bootFile,
- TrainUrl: trainUrl,
- FlavorCode: flavorCode,
- WorkServerNumber: workServerNumber,
- EngineID: int64(engineID),
- LogUrl: logObsPath,
- PoolID: poolID,
- Uuid: uuid,
- Parameters: param, //modelarts train parameters
- CommitID: commitID,
- BranchName: branch_name,
- Params: form.Params,
- FlavorName: FlavorName,
- EngineName: EngineName,
- LabelName: LabelName,
- IsLatestVersion: isLatestVersion,
- VersionCount: VersionCount,
- TotalVersionCount: modelarts.TotalVersionCount,
- ModelName: modelName,
- ModelVersion: modelVersion,
- CkptName: ckptName,
- ResultUrl: resultObsPath,
- }
-
- err = modelarts.GenerateInferenceJob(ctx, req)
- if err != nil {
- log.Error("GenerateTrainJob failed:%v", err.Error())
- inferenceJobErrorNewDataPrepare(ctx, form)
- ctx.RenderWithErr(err.Error(), tplModelArtsInferenceJobNew, &form)
- return
- }
- ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/inference-job")
- }
- func InferenceJobIndex(ctx *context.Context) {
- MustEnableModelArts(ctx)
-
- repo := ctx.Repo.Repository
- page := ctx.QueryInt("page")
- if page <= 0 {
- page = 1
- }
-
- var jobTypes []string
- jobTypes = append(jobTypes, string(models.JobTypeInference))
- tasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{
- ListOptions: models.ListOptions{
- Page: page,
- PageSize: setting.UI.IssuePagingNum,
- },
- RepoID: repo.ID,
- Type: models.TypeCloudBrainTwo,
- JobTypes: jobTypes,
- })
- if err != nil {
- ctx.ServerError("Cloudbrain", err)
- return
- }
-
- for i, task := range tasks {
- tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
- tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
- tasks[i].ComputeResource = models.NPUResource
- }
-
- repoId := ctx.Repo.Repository.ID
- Type := -1
- _, model_count, _ := models.QueryModel(&models.AiModelQueryOptions{
- ListOptions: models.ListOptions{
- Page: 1,
- PageSize: 2,
- },
- RepoID: repoId,
- Type: Type,
- New: MODEL_LATEST,
- })
- ctx.Data["MODEL_COUNT"] = model_count
-
- pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
- pager.SetDefaultParams(ctx)
- ctx.Data["Page"] = pager
-
- ctx.Data["PageIsCloudBrain"] = true
- ctx.Data["Tasks"] = tasks
- ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx)
- ctx.Data["RepoIsEmpty"] = repo.IsEmpty
- ctx.HTML(200, tplModelArtsInferenceJobIndex)
- }
- func InferenceJobNew(ctx *context.Context) {
- err := inferenceJobNewDataPrepare(ctx)
- if err != nil {
- ctx.ServerError("get new inference-job info failed", err)
- return
- }
- ctx.HTML(200, tplModelArtsInferenceJobNew)
- }
- func inferenceJobNewDataPrepare(ctx *context.Context) error {
- ctx.Data["PageIsCloudBrain"] = true
-
- t := time.Now()
- var displayJobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
- ctx.Data["display_job_name"] = displayJobName
-
- attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
- if err != nil {
- ctx.ServerError("GetAllUserAttachments failed:", err)
- return err
- }
- ctx.Data["attachments"] = attachs
-
- var resourcePools modelarts.ResourcePool
- if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
- }
- ctx.Data["resource_pools"] = resourcePools.Info
-
- var engines modelarts.Engine
- if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
- }
- ctx.Data["engines"] = engines.Info
-
- var versionInfos modelarts.VersionInfo
- if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
- }
- ctx.Data["engine_versions"] = versionInfos.Version
-
- var flavorInfos modelarts.Flavor
- if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
- }
-
- ctx.Data["flavor_infos"] = flavorInfos.Info
- ctx.Data["params"] = ""
- ctx.Data["branchName"] = ctx.Repo.BranchName
-
- configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
- if err != nil {
- ctx.ServerError("getConfigList failed:", err)
- return err
- }
- ctx.Data["config_list"] = configList.ParaConfigs
-
- repoId := ctx.Repo.Repository.ID
- Type := -1
- _, model_count, _ := models.QueryModel(&models.AiModelQueryOptions{
- ListOptions: models.ListOptions{
- Page: 1,
- PageSize: 2,
- },
- RepoID: repoId,
- Type: Type,
- New: MODEL_LATEST,
- })
- ctx.Data["MODEL_COUNT"] = model_count
- ctx.Data["cloudbraintype"] = models.TypeCloudBrainTwo
-
- return nil
- }
-
- func inferenceJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArtsInferenceJobForm) error {
- ctx.Data["PageIsCloudBrain"] = true
-
- t := time.Now()
- var jobName = "inference" + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
- ctx.Data["job_name"] = jobName
-
- attachs, err := models.GetModelArtsTrainAttachments(ctx.User.ID)
- if err != nil {
- ctx.ServerError("GetAllUserAttachments failed:", err)
- return err
- }
- ctx.Data["attachments"] = attachs
-
- var resourcePools modelarts.ResourcePool
- if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
- }
- ctx.Data["resource_pools"] = resourcePools.Info
-
- var engines modelarts.Engine
- if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
- }
- ctx.Data["engines"] = engines.Info
-
- var versionInfos modelarts.VersionInfo
- if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
- }
- ctx.Data["engine_versions"] = versionInfos.Version
-
- var flavorInfos modelarts.Flavor
- if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
- }
- ctx.Data["flavor_infos"] = flavorInfos.Info
-
- configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
- if err != nil {
- ctx.ServerError("getConfigList failed:", err)
- return err
- }
- var Parameters modelarts.Parameters
- if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
- ctx.ServerError("json.Unmarshal failed:", err)
- return err
- }
- ctx.Data["params"] = Parameters.Parameter
- ctx.Data["config_list"] = configList.ParaConfigs
- ctx.Data["bootFile"] = form.BootFile
- ctx.Data["uuid"] = form.Attachment
- ctx.Data["branch_name"] = form.BranchName
- ctx.Data["model_name"] = form.ModelName
- ctx.Data["model_version"] = form.ModelVersion
- ctx.Data["ckpt_name"] = form.CkptName
- ctx.Data["train_url"] = form.TrainUrl
-
- return nil
- }
- func InferenceJobShow(ctx *context.Context) {
- ctx.Data["PageIsCloudBrain"] = true
- var jobID = ctx.Params(":jobid")
-
- page := ctx.QueryInt("page")
- if page <= 0 {
- page = 1
- }
- task, err := models.GetCloudbrainByJobID(jobID)
-
- if err != nil {
- log.Error("GetInferenceTask(%s) failed:%v", jobID, err.Error())
- ctx.RenderWithErr(err.Error(), tplModelArtsInferenceJobShow, nil)
- return
- }
- //设置权限
- canNewJob, err := canUserCreateTrainJobVersion(ctx, task.UserID)
- if err != nil {
- ctx.ServerError("canNewJob failed", err)
- return
- }
- ctx.Data["canNewJob"] = canNewJob
-
- //将运行参数转化为epoch_size = 3, device_target = Ascend的格式
- var parameters models.Parameters
- err = json.Unmarshal([]byte(task.Parameters), ¶meters)
- if err != nil {
- log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
- trainJobNewDataPrepare(ctx)
- return
- }
-
- if len(parameters.Parameter) > 0 {
- paramTemp := ""
- for _, Parameter := range parameters.Parameter {
- param := Parameter.Label + " = " + Parameter.Value + "; "
- paramTemp = paramTemp + param
- }
- task.Parameters = paramTemp[:len(paramTemp)-2]
- } else {
- task.Parameters = ""
- }
-
- LabelName := strings.Fields(task.LabelName)
- ctx.Data["labelName"] = LabelName
- ctx.Data["jobID"] = jobID
- ctx.Data["jobName"] = task.JobName
- ctx.Data["displayJobName"] = task.DisplayJobName
- ctx.Data["task"] = task
- ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task)
-
- tempUids := []int64{}
- tempUids = append(tempUids, task.UserID)
- JobCreater, err := models.GetUserNamesByIDs(tempUids)
- if err != nil {
- log.Error("GetUserNamesByIDs (WhitelistUserIDs): %v", err)
- }
- ctx.Data["userName"] = JobCreater[0]
- ctx.HTML(http.StatusOK, tplModelArtsInferenceJobShow)
- }
-
- func ModelDownload(ctx *context.Context) {
- var (
- err error
- )
-
- var jobID = ctx.Params(":jobid")
- versionName := ctx.Query("version_name")
- parentDir := ctx.Query("parent_dir")
- fileName := ctx.Query("file_name")
- log.Info("DownloadSingleModelFile start.")
- task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
- if err != nil {
- log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
- return
- }
-
- path := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, setting.OutPutPath, versionName, parentDir, fileName), "/")
- log.Info("Download path is:%s", path)
-
- url, err := storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, path)
- if err != nil {
- log.Error("GetObsCreateSignedUrl failed: %v", err.Error(), ctx.Data["msgID"])
- ctx.ServerError("GetObsCreateSignedUrl", err)
- return
- }
- http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently)
- }
-
- func ResultDownload(ctx *context.Context) {
- var (
- err error
- )
-
- versionName := ctx.Query("version_name")
- parentDir := ctx.Query("parent_dir")
- fileName := ctx.Query("file_name")
- log.Info("DownloadResult start.")
- task := ctx.Cloudbrain
- if err != nil {
- ctx.Data["error"] = err.Error()
- }
- path := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, "result/", versionName, parentDir, fileName), "/")
- log.Info("Download path is:%s", path)
-
- url, err := storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, path)
- if err != nil {
- log.Error("GetObsCreateSignedUrl failed: %v", err.Error(), ctx.Data["msgID"])
- ctx.ServerError("GetObsCreateSignedUrl", err)
- return
- }
- http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently)
- }
- func DeleteJobStorage(jobName string) error {
- //delete local
- localJobPath := setting.JobPath + jobName
- err := os.RemoveAll(localJobPath)
- if err != nil {
- log.Error("RemoveAll(%s) failed:%v", localJobPath, err)
- }
-
- //delete oss
- dirPath := setting.CodePathPrefix + jobName + "/"
- err = storage.ObsRemoveObject(setting.Bucket, dirPath)
- if err != nil {
- log.Error("ObsRemoveObject(%s) failed:%v", localJobPath, err)
- }
-
- return nil
- }
-
- func DownloadMultiResultFile(ctx *context.Context) {
- var jobID = ctx.Params(":jobid")
- var versionName = ctx.Query("version_name")
- task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
- if err != nil {
- log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
- return
- }
- // if !isCanDeleteOrDownload(ctx, task) {
- // ctx.ServerError("no right.", errors.New(ctx.Tr("repo.model_noright")))
- // return
- // }
-
- // path := Model_prefix + models.AttachmentRelativePath(id) + "/"
- path := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, "result/", versionName), "/") + "/"
-
- allFile, err := storage.GetAllObjectByBucketAndPrefix(setting.Bucket, path)
- if err == nil {
- //count++
- // models.ModifyModelDownloadCount(id)
-
- returnFileName := task.DisplayJobName + ".zip"
- ctx.Resp.Header().Set("Content-Disposition", "attachment; filename="+returnFileName)
- ctx.Resp.Header().Set("Content-Type", "application/octet-stream")
- w := zip.NewWriter(ctx.Resp)
- defer w.Close()
- for _, oneFile := range allFile {
- if oneFile.IsDir {
- log.Info("zip dir name:" + oneFile.FileName)
- } else {
- log.Info("zip file name:" + oneFile.FileName)
- fDest, err := w.Create(oneFile.FileName)
- if err != nil {
- log.Info("create zip entry error, download file failed: %s\n", err.Error())
- ctx.ServerError("download file failed:", err)
- return
- }
- body, err := storage.ObsDownloadAFile(setting.Bucket, path+oneFile.FileName)
- if err != nil {
- log.Info("download file failed: %s\n", err.Error())
- ctx.ServerError("download file failed:", err)
- return
- } else {
- defer body.Close()
- p := make([]byte, 1024)
- var readErr error
- var readCount int
- // 读取对象内容
- for {
- readCount, readErr = body.Read(p)
- if readCount > 0 {
- fDest.Write(p[:readCount])
- }
- if readErr != nil {
- break
- }
- }
- }
- }
- }
- } else {
- log.Info("error,msg=" + err.Error())
- ctx.ServerError("no file to download.", err)
- }
- }
-
- func SetJobCount(ctx *context.Context) {
- repoId := ctx.Repo.Repository.ID
- _, jobCount, err := models.Cloudbrains(&models.CloudbrainsOptions{
- RepoID: repoId,
- Type: models.TypeCloudBrainAll,
- })
- if err != nil {
- ctx.ServerError("Get job faild:", err)
- return
- }
- ctx.Data["jobCount"] = jobCount
- }
|