#4080 fix-4066--智算NPU任务给指定组织配置多节点

Merged
zouap merged 2 commits from fix-3956 into V20230425 1 year ago
  1. +14
    -13
      models/cloudbrain.go
  2. +24
    -12
      modules/grampus/grampus.go
  3. +2
    -0
      modules/setting/setting.go
  4. +12
    -1
      routers/api/v1/user/repo.go
  5. +21
    -0
      routers/repo/grampus.go
  6. +30
    -0
      services/cloudbrain/cloudbrainTask/train.go
  7. +15
    -1
      templates/repo/grampus/trainjob/npu/new.tmpl

+ 14
- 13
models/cloudbrain.go View File

@@ -1700,19 +1700,20 @@ type GetGrampusDebugJobEventsResponse struct {
}

// GrampusTasks is one task entry of a Grampus job request. Each field is
// serialized under the JSON key given in its struct tag.
// NOTE(review): this hunk comes from a rendered diff, so the field list below
// appears twice (pre-change lines first, then post-change lines); the only
// field that exists solely in the second copy is WorkServerNumber.
type GrampusTasks struct {
Command string `json:"command"`
Name string `json:"name"`
ImageId string `json:"imageId"`
ResourceSpecId string `json:"resourceSpecId"`
ImageUrl string `json:"imageUrl"`
CenterID []string `json:"centerID"`
CenterName []string `json:"centerName"`
ReplicaNum int `json:"replicaNum"`
Datasets []GrampusDataset `json:"datasets"`
Models []GrampusDataset `json:"models"`
Code GrampusDataset `json:"code"`
BootFile string `json:"bootFile"`
OutPut GrampusDataset `json:"output"`
Command string `json:"command"`
Name string `json:"name"`
ImageId string `json:"imageId"`
ResourceSpecId string `json:"resourceSpecId"`
ImageUrl string `json:"imageUrl"`
CenterID []string `json:"centerID"`
CenterName []string `json:"centerName"`
ReplicaNum int `json:"replicaNum"`
Datasets []GrampusDataset `json:"datasets"`
Models []GrampusDataset `json:"models"`
Code GrampusDataset `json:"code"`
BootFile string `json:"bootFile"`
OutPut GrampusDataset `json:"output"`
// WorkServerNumber is sent under the JSON key "nodeCount"; presumably the
// number of nodes requested for the task — confirm against the Grampus API.
WorkServerNumber int `json:"nodeCount"`
}
type GrampusNotebookTask struct {
AutoStopDuration int `json:"autoStopDuration"`


+ 24
- 12
modules/grampus/grampus.go View File

@@ -6,6 +6,8 @@ import (
"strconv"
"strings"

"code.gitea.io/gitea/modules/modelarts"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/cloudbrain"
"code.gitea.io/gitea/modules/context"
@@ -44,6 +46,8 @@ var (
SpecialPools *models.SpecialPools

CommandPrepareScriptGpu = ";mkdir -p output;mkdir -p code;mkdir -p dataset;mkdir -p pretrainmodel;"

MultiNodeConfig *modelarts.MultiNodes
)

type GenerateTrainJobReq struct {
@@ -419,18 +423,19 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId str
Name: req.JobName,
Tasks: []models.GrampusTasks{
{
Name: req.JobName,
Command: req.Command,
ResourceSpecId: req.Spec.SourceSpecId,
ImageId: req.ImageId,
ImageUrl: req.ImageUrl,
CenterID: req.Spec.GetAvailableCenterIds(ctx.User.ID),
ReplicaNum: 1,
Datasets: datasetGrampus,
Models: modelGrampus,
Code: codeGrampus,
BootFile: req.BootFile,
OutPut: outputGrampus,
Name: req.JobName,
Command: req.Command,
ResourceSpecId: req.Spec.SourceSpecId,
ImageId: req.ImageId,
ImageUrl: req.ImageUrl,
CenterID: req.Spec.GetAvailableCenterIds(ctx.User.ID),
ReplicaNum: 1,
Datasets: datasetGrampus,
Models: modelGrampus,
Code: codeGrampus,
BootFile: req.BootFile,
OutPut: outputGrampus,
WorkServerNumber: req.WorkServerNumber,
},
},
})
@@ -599,3 +604,10 @@ func GetCenterProxy(aiCenterID string) string {

return proxy
}

// InitMultiNode lazily parses the JSON string in setting.Grampus.MultiNode
// into the package-level MultiNodeConfig. It does nothing when
// MultiNodeConfig is already set or when the setting is empty.
//
// The JSON is decoded into a local value and published only when decoding
// succeeds, so a malformed setting leaves MultiNodeConfig nil instead of
// exposing a partially decoded configuration. The function has no error
// return; callers check MultiNodeConfig for nil before using it.
func InitMultiNode() {
	if MultiNodeConfig != nil || setting.Grampus.MultiNode == "" {
		return
	}

	// MultiNodeConfig is nil at this point, so parsed starts out as a nil
	// pointer of the same type and json.Unmarshal allocates the target.
	parsed := MultiNodeConfig
	if err := json.Unmarshal([]byte(setting.Grampus.MultiNode), &parsed); err != nil {
		// Do not publish a half-filled config; a later call will retry.
		return
	}
	MultiNodeConfig = parsed
}

+ 2
- 0
modules/setting/setting.go View File

@@ -637,6 +637,7 @@ var (
AiCenterCodeAndNameInfo string
UsageRateBeginTime string
GPUImageCommonName string
MultiNode string
}{}

ClearStrategy = struct {
@@ -1847,6 +1848,7 @@ func getGrampusConfig() {
log.Error("Unmarshal(AiCenterInfo) failed:%v", err)
}
}
Grampus.MultiNode = sec.Key("MULTI_NODE").MustString("")

}



+ 12
- 1
routers/api/v1/user/repo.go View File

@@ -5,9 +5,11 @@
package user

import (
"code.gitea.io/gitea/modules/modelarts"
"net/http"

"code.gitea.io/gitea/modules/grampus"
"code.gitea.io/gitea/modules/modelarts"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/context"
api "code.gitea.io/gitea/modules/structs"
@@ -151,6 +153,15 @@ func ListOrgRepos(ctx *context.APIContext) {
func GetComputeNodes(ctx *context.APIContext) {
taskeType := ctx.QueryInt("type")
if taskeType == 2 {
grampus.InitMultiNode()
if grampus.MultiNodeConfig != nil {
for _, info := range grampus.MultiNodeConfig.Info {
if isInOrg, _ := models.IsOrganizationMemberByOrgName(info.Org, ctx.User.ID); isInOrg {
ctx.JSON(http.StatusOK, info.Node)
return
}
}
}
ctx.JSON(http.StatusOK, []int{1})
} else {
modelarts.InitMultiNode()


+ 21
- 0
routers/repo/grampus.go View File

@@ -438,6 +438,7 @@ func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) err
ctx.Data["WaitCount"] = waitCount
NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, string(models.JobTypeTrain))
ctx.Data["NotStopTaskCount"] = NotStopTaskCount
setGrampusMultiNodeIfConfigureMatch(ctx)
} else if processType == grampus.ProcessorTypeGCU {
ctx.Data["datasetType"] = models.TypeCloudBrainAll
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, models.GCUResource, models.JobTypeTrain)
@@ -1102,6 +1103,14 @@ func grampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
return
}

errMsg = cloudbrainTask.CheckGrampusNPUMultiNode(ctx.User.ID, form.WorkServerNumber)
if errMsg != "" {
log.Error("multi node match failed:%s", errMsg, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr(ctx.Tr(errMsg), tpl, &form)
return
}

bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
if err != nil || !bootFileExist {
log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
@@ -1275,6 +1284,18 @@ func grampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
}

// setGrampusMultiNodeIfConfigureMatch exposes the configured node options to
// the page being rendered. After making sure the multi-node configuration has
// been initialised, it walks the configured entries in order and, for the
// first entry whose organization the current user belongs to, stores that
// entry's Node value in ctx.Data["WorkNode"]. When there is no configuration
// or no entry matches, ctx.Data is left untouched.
func setGrampusMultiNodeIfConfigureMatch(ctx *context.Context) {
	grampus.InitMultiNode()

	cfg := grampus.MultiNodeConfig
	if cfg == nil {
		return
	}

	for _, entry := range cfg.Info {
		// The lookup error is discarded; only the boolean result is used.
		member, _ := models.IsOrganizationMemberByOrgName(entry.Org, ctx.User.ID)
		if !member {
			continue
		}
		ctx.Data["WorkNode"] = entry.Node
		return
	}
}

func GetGrampusNotebook(ctx *context.APIContext) {
var (
err error


+ 30
- 0
services/cloudbrain/cloudbrainTask/train.go View File

@@ -574,6 +574,12 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, option api.CreateTrainJobOpt
return
}

errMsg := CheckGrampusNPUMultiNode(ctx.User.ID, option.WorkServerNumber)
if errMsg != "" {
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr(errMsg)))
return
}

//prepare code and out path
_, err = ioutil.ReadDir(codeLocalPath)
if err == nil {
@@ -735,6 +741,30 @@ func paramCheckCreateTrainJob(bootFile string, branchName string) error {

return nil
}

// CheckGrampusNPUMultiNode validates the node count requested for a Grampus
// NPU training job.
//
// A request for exactly one node is always accepted without consulting the
// configuration. Any other count is accepted only when the multi-node
// configuration contains an entry whose organization the user is a member of
// and for which isInNodes(entry.Node, serverNum) reports true.
//
// It returns "" when the count is allowed, and the message key
// "repo.modelarts.no_node_right" otherwise.
func CheckGrampusNPUMultiNode(userId int64, serverNum int) string {
	const noNodeRight = "repo.modelarts.no_node_right"

	if serverNum == 1 {
		return ""
	}

	grampus.InitMultiNode()
	if grampus.MultiNodeConfig == nil {
		return noNodeRight
	}

	for _, entry := range grampus.MultiNodeConfig.Info {
		// The lookup error is discarded; only the boolean result is used.
		member, _ := models.IsOrganizationMemberByOrgName(entry.Org, userId)
		if member && isInNodes(entry.Node, serverNum) {
			return ""
		}
	}
	return noNodeRight
}
func downloadZipCode(ctx *context.Context, codePath, branchName string) error {
archiveType := git.ZIP
archivePath := codePath


+ 15
- 1
templates/repo/grampus/trainjob/npu/new.tmpl View File

@@ -179,7 +179,21 @@
<input style="border-radius: 0;text-align: center;"type="hidden" name="work_server_number" id="trainjob_work_server_num" tabindex="3" autofocus required maxlength="255" value="1" readonly>
<div class="field" id="trainjob_work_server_num_select" name="work_server_number_select">
<select class="ui dropdown width" style='width: 100%;' name="work_server_id">
<option name="server_id" value="1">1</option>
{{if .WorkNode}}
{{range .WorkNode}}
{{if $.work_server_number}}
{{if eq . $.work_server_number }}
<option name="server_id" selected value="{{.}}">{{.}}</option>
{{else}}
<option name="server_id" value="{{.}}">{{.}}</option>
{{end}}
{{else}}
<option name="server_id" value="{{.}}">{{.}}</option>
{{end}}
{{end}}
{{else}}
<option name="server_id" value="1">1</option>
{{end}}
</select>
</div>



Loading…
Cancel
Save