@@ -15,6 +15,8 @@ import (
"time"
"unicode/utf8"
"code.gitea.io/gitea/modules/notification"
"code.gitea.io/gitea/modules/grampus"
"code.gitea.io/gitea/modules/timeutil"
@@ -222,6 +224,7 @@ func CloudBrainNew(ctx *context.Context) {
ctx.ServerError("get new cloudbrain info failed", err)
return
}
ctx.Data["PageIsGPUDebug"] = true
ctx.HTML(200, tplCloudBrainNew)
}
@@ -287,13 +290,17 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
return
}
}
datasetInfos, datasetNames, err := models.GetDatasetInfo(uuids)
if err != nil {
log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
return
var datasetInfos map[string]models.DatasetInfo
var datasetNames string
//var
if uuids != "" {
datasetInfos, datasetNames, err = models.GetDatasetInfo(uuids)
if err != nil {
log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
return
}
}
command := cloudbrain.GetCloudbrainDebugCommand()
@@ -302,6 +309,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
commandTrain, err := getTrainJobCommand(form)
if err != nil {
log.Error("getTrainJobCommand failed: %v", err)
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(err.Error(), tpl, &form)
return
}
@@ -370,7 +378,6 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
}
}
func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBrainInferencForm) {
ctx.Data["PageIsCloudBrain"] = true
displayJobName := form.DisplayJobName
@@ -391,6 +398,7 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra
command, err := getInferenceJobCommand(form)
if err != nil {
log.Error("getTrainJobCommand failed: %v", err)
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(err.Error(), tpl, &form)
return
}
@@ -413,6 +421,7 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra
}
if !jobNamePattern.MatchString(displayJobName) {
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
return
}
@@ -491,6 +500,7 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/inference-job")
}
/**
检查用户传输的参数是否符合专属资源池
*/
@@ -633,6 +643,7 @@ func CloudBrainTrainJobShow(ctx *context.Context) {
func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.JobType) {
ctx.Data["PageIsCloudBrain"] = true
debugListType := ctx.Query("debugListType")
cloudbrain.InitSpecialPool()
var task *models.Cloudbrain
var err error
@@ -644,22 +655,22 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo
if err != nil {
log.Info("error:" + err.Error())
ctx.Data["error"] = err.Error( )
ctx.NotFound(ctx.Req.URL.RequestURI(), nil )
return
}
result, err := cloudbrain.GetJob(task.JobID)
if err != nil {
log.Info("error:" + err.Error())
ctx.Data["error"] = err.Error( )
ctx.NotFound(ctx.Req.URL.RequestURI(), nil )
return
}
hasSpec := false
if task.JobType == string(models.JobTypeTrain) {
if cloudbrain.TrainResourceSpecs == nil {
json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs)
}
hasSpec := false
for _, tmp := range cloudbrain.TrainResourceSpecs.ResourceSpec {
if tmp.Id == task.ResourceSpecId {
hasSpec = true
@@ -667,24 +678,7 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo
ctx.Data["CpuNum"] = tmp.CpuNum
ctx.Data["MemMiB"] = tmp.MemMiB
ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB
}
}
if !hasSpec && cloudbrain.SpecialPools != nil {
for _, specialPool := range cloudbrain.SpecialPools.Pools {
if specialPool.ResourceSpec != nil {
for _, spec := range specialPool.ResourceSpec {
if task.ResourceSpecId == spec.Id {
ctx.Data["GpuNum"] = spec.GpuNum
ctx.Data["CpuNum"] = spec.CpuNum
ctx.Data["MemMiB"] = spec.MemMiB
ctx.Data["ShareMemMiB"] = spec.ShareMemMiB
break
}
}
}
break
}
}
@@ -694,10 +688,12 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo
}
for _, tmp := range cloudbrain.InferenceResourceSpecs.ResourceSpec {
if tmp.Id == task.ResourceSpecId {
hasSpec = true
ctx.Data["GpuNum"] = tmp.GpuNum
ctx.Data["CpuNum"] = tmp.CpuNum
ctx.Data["MemMiB"] = tmp.MemMiB
ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB
break
}
}
} else {
@@ -706,10 +702,32 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo
}
for _, tmp := range cloudbrain.ResourceSpecs.ResourceSpec {
if tmp.Id == task.ResourceSpecId {
hasSpec = true
ctx.Data["GpuNum"] = tmp.GpuNum
ctx.Data["CpuNum"] = tmp.CpuNum
ctx.Data["MemMiB"] = tmp.MemMiB
ctx.Data["ShareMemMiB"] = tmp.ShareMemMiB
break
}
}
}
if !hasSpec && cloudbrain.SpecialPools != nil {
for _, specialPool := range cloudbrain.SpecialPools.Pools {
if specialPool.ResourceSpec != nil {
for _, spec := range specialPool.ResourceSpec {
if task.ResourceSpecId == spec.Id {
ctx.Data["GpuNum"] = spec.GpuNum
ctx.Data["CpuNum"] = spec.CpuNum
ctx.Data["MemMiB"] = spec.MemMiB
ctx.Data["ShareMemMiB"] = spec.ShareMemMiB
break
}
}
}
}
}
@@ -728,14 +746,6 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo
ctx.Data["resource_type"] = resourceType.Value
}
}
for _, specialPool := range cloudbrain.SpecialPools.Pools {
for _, resourceType := range specialPool.Pool {
if resourceType.Queue == jobRes.Config.GpuType {
ctx.Data["resource_type"] = resourceType.Value
}
}
}
} else if task.JobType == string(models.JobTypeInference) {
if inferenceGpuInfos == nil {
@@ -767,16 +777,30 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo
}
}
}
if cloudbrain.SpecialPools != nil {
for _, specialPool := range cloudbrain.SpecialPools.Pools {
for _, resourceType := range specialPool.Pool {
if resourceType.Queue == jobRes.Config.GpuType {
ctx.Data["resource_type"] = resourceType.Value
}
}
}
}
taskRoles := jobRes.TaskRoles
taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
ctx.Data["taskRes"] = taskRes
ctx.Data["ExitDiagnostics"] = taskRes.TaskStatuses[0].ExitDiagnostics
oldStatus := task.Status
task.Status = taskRes.TaskStatuses[0].State
task.ContainerID = taskRes.TaskStatuses[0].ContainerID
task.ContainerIp = taskRes.TaskStatuses[0].ContainerIP
models.ParseAndSetDurationFromCloudBrainOne(jobRes, task)
if task.DeletedAt.IsZero() { //normal record
if oldStatus != task.Status {
notification.NotifyChangeCloudbrainStatus(task, oldStatus)
}
err = models.UpdateJob(task)
if err != nil {
ctx.Data["error"] = err.Error()
@@ -892,6 +916,20 @@ func CloudBrainCommitImageShow(ctx *context.Context) {
ctx.HTML(200, tplCloudBrainImageSubmit)
}
// GetImage responds with the JSON representation of the image whose numeric
// identifier is given by the ":id" route parameter.
//
// The handler answers http.StatusNotFound with a nil body when the parameter
// is not a valid base-10 int64 or when models.GetImageByID returns an error;
// otherwise it answers http.StatusOK with the image.
func GetImage(ctx *context.Context) {
	rawID := ctx.Params(":id")
	id, err := strconv.ParseInt(rawID, 10, 64)
	if err != nil {
		// An unparsable id cannot match any image; report it the same way as a
		// failed lookup instead of silently querying with id 0.
		log.Error("GetImage: invalid image id %v: %v", rawID, err)
		ctx.JSON(http.StatusNotFound, nil)
		return
	}

	image, err := models.GetImageByID(id)
	if err != nil {
		log.Error("GetImageByID failed:%v", err.Error())
		ctx.JSON(http.StatusNotFound, nil)
		// Stop here: falling through would write a second (200) response on
		// top of the 404 that was just sent.
		return
	}
	ctx.JSON(http.StatusOK, image)
}
func CloudBrainImageEdit(ctx *context.Context) {
ctx.Data["PageIsImageEdit"] = true
ctx.Data["PageFrom"] = ctx.Params(":from")
@@ -1116,12 +1154,15 @@ func CloudBrainStop(ctx *context.Context) {
errorMsg = "cloudbrain.Stopped_failed"
break
}
oldStatus := task.Status
task.Status = string(models.JobStopped)
if task.EndTime == 0 {
task.EndTime = timeutil.TimeStampNow()
}
task.ComputeAndSetDuration()
if oldStatus != task.Status {
notification.NotifyChangeCloudbrainStatus(task, oldStatus)
}
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
@@ -1215,11 +1256,15 @@ func logErrorAndUpdateJobStatus(err error, taskInfo *models.Cloudbrain) {
if err != nil {
log.Warn("Failed to stop cloudBrain job:"+taskInfo.JobID, err)
} else {
oldStatus := taskInfo.Status
taskInfo.Status = string(models.JobStopped)
if taskInfo.EndTime == 0 {
taskInfo.EndTime = timeutil.TimeStampNow()
}
taskInfo.ComputeAndSetDuration()
if oldStatus != taskInfo.Status {
notification.NotifyChangeCloudbrainStatus(taskInfo, oldStatus)
}
err = models.UpdateJob(taskInfo)
if err != nil {
log.Warn("UpdateJob failed", err)
@@ -1699,9 +1744,13 @@ func SyncCloudbrainStatus() {
jobRes, _ := models.ConvertToJobResultPayload(result.Payload)
taskRoles := jobRes.TaskRoles
taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
oldStatus := task.Status
task.Status = taskRes.TaskStatuses[0].State
if task.Status != string(models.JobWaiting) {
models.ParseAndSetDurationFromCloudBrainOne(jobRes, task)
if oldStatus != task.Status {
notification.NotifyChangeCloudbrainStatus(task, oldStatus)
}
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
@@ -1728,6 +1777,9 @@ func SyncCloudbrainStatus() {
task.EndTime = timeutil.TimeStampNow()
}
task.ComputeAndSetDuration()
if oldStatus != task.Status {
notification.NotifyChangeCloudbrainStatus(task, oldStatus)
}
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
@@ -1746,6 +1798,7 @@ func SyncCloudbrainStatus() {
}
if result != nil {
oldStatus := task.Status
task.Status = result.Status
if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
@@ -1755,6 +1808,9 @@ func SyncCloudbrainStatus() {
}
task.CorrectCreateUnix()
task.ComputeAndSetDuration()
if oldStatus != task.Status {
notification.NotifyChangeCloudbrainStatus(task, oldStatus)
}
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
@@ -1769,6 +1825,7 @@ func SyncCloudbrainStatus() {
}
if result != nil {
oldStatus := task.Status
task.Status = modelarts.TransTrainJobStatus(result.IntStatus)
task.Duration = result.Duration / 1000
task.TrainJobDuration = result.TrainJobDuration
@@ -1781,6 +1838,9 @@ func SyncCloudbrainStatus() {
task.EndTime = task.StartTime.Add(task.Duration)
}
task.CorrectCreateUnix()
if oldStatus != task.Status {
notification.NotifyChangeCloudbrainStatus(task, oldStatus)
}
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
@@ -1801,6 +1861,7 @@ func SyncCloudbrainStatus() {
if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
}
oldStatus := task.Status
task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
task.Duration = result.JobInfo.RunSec
task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
@@ -1812,6 +1873,9 @@ func SyncCloudbrainStatus() {
task.EndTime = task.StartTime.Add(task.Duration)
}
task.CorrectCreateUnix()
if oldStatus != task.Status {
notification.NotifyChangeCloudbrainStatus(task, oldStatus)
}
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
@@ -2662,3 +2726,170 @@ func GetBenchmarkTypes(ctx *context.Context) *models.BenchmarkTypes {
}
return benchmarkTypesMap[lang]
}
// GetCloudbrainAiCenter returns the AI-center label to display for task.
//
// For models.TypeCloudBrainOne and models.TypeCloudBrainTwo the label is the
// translation of "repo.cloudbrain1" / "repo.cloudbrain2" obtained through ctx.
// For models.TypeC2Net it is the part of task.AiCenter after the last "+".
// Any other task type yields an empty string.
func GetCloudbrainAiCenter(task models.Cloudbrain, ctx *context.Context) string {
	switch task.Type {
	case models.TypeCloudBrainOne:
		return ctx.Tr("repo.cloudbrain1")
	case models.TypeCloudBrainTwo:
		return ctx.Tr("repo.cloudbrain2")
	case models.TypeC2Net:
		return getCutStringAiCenterByAiCenter(task.AiCenter)
	default:
		return ""
	}
}
// getCutStringAiCenterByAiCenter returns the portion of aiCenter that follows
// its last "+" separator. When aiCenter contains no "+", the whole string is
// returned; an empty input yields an empty result.
func getCutStringAiCenterByAiCenter(aiCenter string) string {
	if len(aiCenter) == 0 {
		return ""
	}
	if sep := strings.LastIndex(aiCenter, "+"); sep >= 0 {
		return aiCenter[sep+1:]
	}
	// No separator: the original slice expression aiCenter[0:] is the input itself.
	return aiCenter
}
// GetCloudbrainCluster returns the translated cluster label for task.
//
// models.TypeCloudBrainOne and models.TypeCloudBrainTwo map to the
// "cloudbrain.resource_cluster_openi" translation, models.TypeC2Net maps to
// "cloudbrain.resource_cluster_c2net", and every other type yields "".
func GetCloudbrainCluster(task models.Cloudbrain, ctx *context.Context) string {
	switch task.Type {
	case models.TypeCloudBrainOne, models.TypeCloudBrainTwo:
		return ctx.Tr("cloudbrain.resource_cluster_openi")
	case models.TypeC2Net:
		return ctx.Tr("cloudbrain.resource_cluster_c2net")
	default:
		return ""
	}
}
// GetCloudbrainCardDuration returns task.Duration multiplied by the task's
// card count, formatted by models.ConvertDurationToStr.
//
// The card type and error reported by GetCloudbrainCardNumAndType are
// intentionally discarded; only the count is used here.
func GetCloudbrainCardDuration(task models.Cloudbrain) string {
	cards, _, _ := GetCloudbrainCardNumAndType(task)
	return models.ConvertDurationToStr(int64(cards) * task.Duration)
}
// GetCloudbrainWaitTime returns the task's waiting time formatted by
// models.ConvertDurationToStr.
//
// The interval always starts at task.CreatedUnix and ends at:
//   - the current time, while the status is models.JobWaiting;
//   - task.EndTime, when the status is models.JobStopped and task.StartTime
//     converts to Unix time 0;
//   - task.StartTime in every other case.
//
// A negative interval is reported as "00:00:00".
func GetCloudbrainWaitTime(task models.Cloudbrain) string {
	createdAt := task.CreatedUnix.AsTime().Unix()

	// Pick the end of the interval; the start is the same in all branches.
	var until int64
	switch {
	case task.Status == string(models.JobWaiting):
		until = time.Now().Unix()
	case task.Status == string(models.JobStopped) && task.StartTime.AsTime().Unix() == 0:
		until = task.EndTime.AsTime().Unix()
	default:
		until = task.StartTime.AsTime().Unix()
	}

	elapsed := until - createdAt
	formatted := models.ConvertDurationToStr(elapsed)
	if elapsed < 0 {
		return "00:00:00"
	}
	return formatted
}
// GetCloudbrainCardNumAndType returns the card count and card type parsed from
// the task's flavor name.
//
// Before resolving the flavor name it calls the models initialisers for the
// resource-spec and GPU-info maps whenever their init flags are unset. When
// GetCloudbrainFlavorName fails, the failure is not propagated: the result is
// (0, "", nil). Otherwise the values come from getCardNumAndTypeByFlavorname.
func GetCloudbrainCardNumAndType(task models.Cloudbrain) (int, string, error) {
	if !models.SpecsMapInitFlag {
		models.InitCloudbrainOneResourceSpecMap()
	}
	if !models.GpuInfosMapInitFlag {
		models.InitCloudbrainOneGpuInfoMap()
	}

	name, flavorErr := GetCloudbrainFlavorName(task)
	if flavorErr == nil {
		return getCardNumAndTypeByFlavorname(name)
	}
	// Lookup failures are reported as "no cards" rather than as an error.
	return 0, "", nil
}
// getCardNumAndTypeByFlavorname extracts the card count and card type from a
// flavor name laid out as "<prefix>:<count>*<type> ... :<rest>".
//
// The count is the text between the first ":" and the first "*"; the type is
// the text between that "*" and the last ":". Both are trimmed of surrounding
// whitespace. An empty name, or one whose separators are missing or not in
// that order, yields (0, "", nil). A count that is not an integer is logged
// and returned as the strconv.Atoi error.
func getCardNumAndTypeByFlavorname(FlavorName string) (int, string, error) {
	if FlavorName == "" {
		return 0, "", nil
	}

	firstColon := strings.Index(FlavorName, ":")
	finalColon := strings.LastIndex(FlavorName, ":")
	star := strings.Index(FlavorName, "*")

	// Require first ":" < "*" < last ":"; a missing separator (-1) fails too.
	if star < firstColon+1 || finalColon < star+1 {
		return 0, "", nil
	}

	count, err := strconv.Atoi(strings.TrimSpace(FlavorName[firstColon+1 : star]))
	if err != nil {
		log.Error("strconv.Atoi failed: %v", err)
		return 0, "", err
	}
	return count, strings.TrimSpace(FlavorName[star+1 : finalColon]), nil
}
// GetCloudbrainFlavorName returns a display string describing the resources of
// task, or an error when the underlying lookup fails.
//
// Resolution depends on the task type:
//   - models.TypeCloudBrainOne: the string is assembled from the resource spec
//     and GPU info returned by getCloudBrainOneResourceSpec; an error is
//     returned when that call fails or either value is nil.
//   - models.TypeCloudBrainTwo / models.TypeC2Net with a non-empty FlavorName:
//     the stored FlavorName is returned.
//   - models.TypeCloudBrainTwo with only a FlavorCode: the name is derived via
//     getFlavorNameByFlavorCode.
//   - models.TypeCloudBrainTwo debug job with neither: the FlavorCode of the
//     first record from models.GetModelartsReDebugTaskByJobId is used, or ""
//     when there is none.
//
// Every other combination yields ("", nil).
func GetCloudbrainFlavorName(task models.Cloudbrain) (string, error) {
	switch {
	case task.Type == models.TypeCloudBrainOne:
		resourceSpec, gpuInfo, err := getCloudBrainOneResourceSpec(task)
		if err != nil {
			// The format verb is required; without it the error is rendered
			// as a "%!(EXTRA ...)" suffix instead of a readable message.
			log.Info("getCloudBrainOneResourceSpec err: %v", err)
			return "", err
		}
		if resourceSpec == nil || gpuInfo == nil {
			return "", errors.New("resourceSpec or gpuInfo is nil")
		}
		return "GPU:" + strconv.Itoa(resourceSpec.GpuNum) + "*Nvidia-" + gpuInfo.Value +
			" | CPU:" + strconv.Itoa(resourceSpec.CpuNum) + "核" + strconv.Itoa(resourceSpec.MemMiB) + "MB", nil

	case (task.Type == models.TypeCloudBrainTwo || task.Type == models.TypeC2Net) && task.FlavorName != "":
		// NOTE(review): as written both arguments are the same colon, which
		// makes this call a no-op; presumably the first was meant to be a
		// different (e.g. full-width) colon — confirm against the repository.
		return strings.ReplaceAll(task.FlavorName, ":", ":"), nil

	case task.Type == models.TypeCloudBrainTwo && task.FlavorName == "" && task.FlavorCode != "":
		return getFlavorNameByFlavorCode(task.FlavorCode), nil

	case task.Type == models.TypeCloudBrainTwo && task.JobType == string(models.JobTypeDebug) &&
		task.FlavorName == "" && task.FlavorCode == "":
		tasks, err := models.GetModelartsReDebugTaskByJobId(task.JobID)
		if err != nil {
			return "", err
		}
		if len(tasks) >= 1 {
			return getFlavorNameByFlavorCode(tasks[0].FlavorCode), nil
		}
	}
	return "", nil
}
// getCloudBrainOneResourceSpec looks up the resource spec and GPU info for
// task in the per-job-type maps of the models package.
//
// Specs are keyed by task.ResourceSpecId and GPU infos by task.GpuQueue, with
// "openidebug" used when GpuQueue is empty. Train and debug jobs fall back to
// the "special" maps when their own spec map has no entry for the id. A
// negative ResourceSpecId is an error; an unrecognised job type yields
// (nil, nil, nil). Either returned pointer may be nil when a map has no
// matching entry.
func getCloudBrainOneResourceSpec(task models.Cloudbrain) (*models.ResourceSpec, *models.GpuInfo, error) {
	if task.ResourceSpecId < 0 {
		return nil, nil, errors.New("ResourceSpecId is null")
	}

	queue := task.GpuQueue
	if queue == "" {
		queue = "openidebug"
	}
	specID := task.ResourceSpecId

	switch {
	case task.JobType == string(models.JobTypeTrain):
		if spec := models.CloudbrainTrainResourceSpecsMap[specID]; spec != nil {
			return spec, models.CloudbrainTrainGpuInfosMap[queue], nil
		}
		return models.CloudbrainSpecialResourceSpecsMap[specID], models.CloudbrainSpecialGpuInfosMap[queue], nil

	case task.JobType == string(models.JobTypeDebug):
		if spec := models.CloudbrainDebugResourceSpecsMap[specID]; spec != nil {
			return spec, models.CloudbrainDebugGpuInfosMap[queue], nil
		}
		return models.CloudbrainSpecialResourceSpecsMap[specID], models.CloudbrainSpecialGpuInfosMap[queue], nil

	case task.JobType == string(models.JobTypeInference):
		return models.CloudbrainInferenceResourceSpecsMap[specID], models.CloudbrainInferenceGpuInfosMap[queue], nil

	case task.JobType == string(models.JobTypeBenchmark),
		task.JobType == string(models.JobTypeSnn4imagenet),
		task.JobType == string(models.JobTypeBrainScore):
		return models.CloudbrainBenchmarkResourceSpecsMap[specID], models.CloudbrainBenchmarkGpuInfosMap[queue], nil
	}
	return nil, nil, nil
}
// getFlavorNameByFlavorCode builds a display name from flavorCode.
//
// The text after the last "." in flavorCode (the whole string when there is no
// ".") is trimmed and parsed as an integer card count n. The result has the
// form "Ascend:<n>*Ascend-910(<32n>GB)|ARM:<24n>核<256n>GB". When the suffix is
// not an integer the failure is logged and "" is returned.
func getFlavorNameByFlavorCode(flavorCode string) string {
	suffix := strings.TrimSpace(flavorCode[strings.LastIndex(flavorCode, ".")+1:])

	cards, err := strconv.Atoi(suffix)
	if err != nil {
		log.Error("strconv.Atoi failed: %v", err)
		return ""
	}

	return "Ascend:" + suffix +
		"*Ascend-910(" + strconv.Itoa(cards*32) + "GB)|ARM:" + strconv.Itoa(cards*24) +
		"核" + strconv.Itoa(cards*256) + "GB"
}