|
|
@@ -438,6 +438,7 @@ func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) err |
|
|
|
ctx.Data["WaitCount"] = waitCount |
|
|
|
NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, string(models.JobTypeTrain)) |
|
|
|
ctx.Data["NotStopTaskCount"] = NotStopTaskCount |
|
|
|
setGrampusMultiNodeIfConfigureMatch(ctx) |
|
|
|
} else if processType == grampus.ProcessorTypeGCU { |
|
|
|
ctx.Data["datasetType"] = models.TypeCloudBrainAll |
|
|
|
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, models.GCUResource, models.JobTypeTrain) |
|
|
@@ -1102,6 +1103,14 @@ func grampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
errMsg = cloudbrainTask.CheckGrampusNPUMultiNode(ctx.User.ID, form.WorkServerNumber) |
|
|
|
if errMsg != "" { |
|
|
|
log.Error("multi node match failed:%s", errMsg, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) |
|
|
|
ctx.RenderWithErr(ctx.Tr(errMsg), tpl, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName) |
|
|
|
if err != nil || !bootFileExist { |
|
|
|
log.Error("Get bootfile error:", err, ctx.Data["MsgID"]) |
|
|
@@ -1275,6 +1284,18 @@ func grampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") |
|
|
|
} |
|
|
|
|
|
|
|
func setGrampusMultiNodeIfConfigureMatch(ctx *context.Context) { |
|
|
|
grampus.InitMultiNode() |
|
|
|
if grampus.MultiNodeConfig != nil { |
|
|
|
for _, info := range grampus.MultiNodeConfig.Info { |
|
|
|
if isInOrg, _ := models.IsOrganizationMemberByOrgName(info.Org, ctx.User.ID); isInOrg { |
|
|
|
ctx.Data["WorkNode"] = info.Node |
|
|
|
break |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
func GetGrampusNotebook(ctx *context.APIContext) { |
|
|
|
var ( |
|
|
|
err error |
|
|
|