|
- /* ******************************************************************************
- * 2019 - present Contributed by Apulis Technology (Shenzhen) Co. LTD
- *
- * This program and the accompanying materials are made available under the
- * terms of the MIT License, which is available at
- * https://www.opensource.org/licenses/MIT
- *
- * See the NOTICE file distributed with this work for additional
- * information regarding copyright ownership.
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations
- * under the License.
- *
- * SPDX-License-Identifier: MIT
- ******************************************************************************/
- package services
-
- import (
- "fmt"
- "strconv"
- "strings"
-
- "github.com/apulis/app/ai-arts-backend/internal/configs"
- "github.com/apulis/app/ai-arts-backend/internal/dto"
- "github.com/apulis/app/ai-arts-backend/pkg/exports"
- )
-
// AomComputeQuota is one compute device quota entry: CPU | GPU | NPU.
//
// The JSON key of a field does not always match its Go name: DeviceType is
// exchanged as "model", DeviceNum as "num" and DeviceMax as "max".
type AomComputeQuota struct {
	// Type is the device class; this file compares it against "CPU", "GPU"
	// and "NPU".
	Type string `json:"type"`
	// Arch is matched against the CPU architecture of a request or node.
	Arch string `json:"arch"`
	// ComputeType and Series, together with Type, identify a quota entry when
	// it is looked up.
	ComputeType string `json:"computeType"`
	DeviceType  string `json:"model"`
	Series      string `json:"series"`
	// DeviceNum is the amount used as the limit in quota checks.
	DeviceNum int `json:"num"`
	// DeviceMax is the maximum reported by the remote service.
	DeviceMax int `json:"max"`
	// Used is the amount already consumed.
	Used int `json:"used"`
}
-
// Upper bounds used when computing resource quota limits.
const (
	// MAX_DEV_PER_NODE is the largest per-node device count that is offered.
	MAX_DEV_PER_NODE = 8
	// MAX_VCJOB_NODE is the default cap on the number of nodes of one job.
	MAX_VCJOB_NODE = 32
)
-
- type UserGroupResourceQuota struct {
- MemMax int64 `json:"memMax"`
- MemUsed int64 `json:"memUsed"`
- Quotas []AomComputeQuota `json:"quotas"`
- }
-
- type AomQuotaDetail struct {
- AomComputeQuota
- Mem int64 `json:"mem"`
- CpuNum int `json:"cpuNum"`
- ID int `json:"id"`
- }
-
- func (d *UserGroupResourceQuota) GetMaxNodes(quota *AomQuotaDetail) int {
- n1 := 0
- if q := d.GetComputeLimit(quota); q == nil {
- return 0
- } else if quota.DeviceNum > 0 {
- n1 = int(q.DeviceNum / quota.DeviceNum)
- } else {
- n1 = int(q.DeviceNum / quota.CpuNum)
- }
- n2 := int(d.MemMax / quota.Mem)
- if n1 < n2 {
- n2 = n1
- }
- if quota.DeviceNum > 0 { //check cpu max nodes also
- n3 := d.GetCPULimit(quota) / quota.CpuNum
- if n3 < n2 {
- n2 = n3
- }
- }
- return n2
- }
-
- func (d*UserGroupResourceQuota) GetRequestLimits(quota* dto.Quota, q *AomQuotaDetail) {
- if quota.Node == 0 {
- quota.Node=1
- }
- //@modify: request & limit always remain the same !!!
- quota.Limit =quota.Request
- return
- quota.Limit.Device=quota.Request.Device
- //@todo: each request use 80% total group quota at most ???
- cpuLimit := d.GetCPULimit(q)/quota.Node
- memLimit := d.MemMax/int64(quota.Node)
-
- cpuRequest,_ := strconv.ParseInt(quota.Request.CPU,0,32)
- memRequest,_ := strconv.ParseInt(quota.Request.Memory,0,64)
- if cpuLimit < int(cpuRequest) {
- cpuLimit = int(cpuRequest)
- }
- if memLimit < memRequest {
- memLimit = memRequest
- }
- quota.Limit.CPU = fmt.Sprintf("%d",cpuLimit)
- quota.Limit.Memory = fmt.Sprintf("%d",memLimit)
- }
-
- func (d *UserGroupResourceQuota) GetCPULimit(quota *AomQuotaDetail) int {
- n := 0
- for _, v := range d.Quotas {
- if v.Type == "CPU" && (len(quota.Arch) == 0 || quota.Arch == v.Arch) {
- if v.DeviceNum > n {
- n = v.DeviceNum
- }
- //n += v.DeviceNum
- }
- }
- return n
- }
-
- func (d *UserGroupResourceQuota) GetComputeLimit(quota *AomQuotaDetail) *AomComputeQuota {
- for _, v := range d.Quotas {
- if v.Type == quota.Type && v.ComputeType == quota.ComputeType && v.Series == quota.Series {
- return &v
- }
- }
- return nil
- }
-
- func (d *UserGroupResourceQuota) GetCPULimitQuota(arch string) *AomComputeQuota {
- for _, v := range d.Quotas {
- if v.Type == "CPU" && v.Arch == arch{
- return &v
- }
- }
- return nil
- }
-
- func (d* UserGroupResourceQuota) CheckRequestFitLimit(quota * AomQuotaDetail,nodes int) string{
- if d.MemUsed + quota.Mem*int64(nodes) > d.MemMax {
- return "check memory used exceed quota !"
- }
- if q := d.GetComputeLimit(quota); q == nil {
- return fmt.Sprintf("not found request quota %s %s %s",quota.Type,quota.ComputeType,quota.Series)
- } else if quota.DeviceNum > 0 && quota.DeviceNum*nodes + q.Used > q.DeviceNum{
- return fmt.Sprintf("request device num %d used %d exceed %d !",quota.DeviceNum*nodes,q.Used,q.DeviceNum)
- } else if quota.DeviceNum == 0 && quota.CpuNum*nodes + q.Used > q.DeviceNum{
- return fmt.Sprintf("request cpu quota %d used %d exceed %d !",quota.CpuNum*nodes,q.Used,q.DeviceNum)
- }
- if quota.DeviceNum > 0 { //check cpu max nodes also
- if q:= d.GetCPULimitQuota(quota.Arch);q == nil {
- return fmt.Sprintf("not found request type %s cpu arch %s!",quota.Type,quota.Arch)
- }else if quota.CpuNum*nodes + q.Used > q.DeviceNum{
- return fmt.Sprintf("[cpu] request cpu quota %d used %d exceed %d !",quota.CpuNum*nodes,q.Used,q.DeviceNum)
- }
- }
- return ""
- }
-
- type AomNodeQuota struct {
- CpuArch string `json:"cpuArch"`
- CpuNum int `json:"cpuNum"`
- Mem int64 `json:"mem"`
- Status string `json:"status"`
- Roles string `json:"roles"`
- Devs []AomComputeQuota `json:"devs"`
- }
-
-
- func (d *AomNodeQuota) GetDeviceNum(q *AomQuotaDetail) int {
- if d.Mem < q.Mem || d.CpuNum < q.CpuNum {
- return 0
- }
- if len(q.Arch) != 0 && q.Arch != d.CpuArch {
- return 0
- }
- roles := strings.Split(d.Roles,",")
- for _,role := range(roles) {
- if role == "edge" {//@todo: to be changed !
- return 0
- }
- }
- for _, v := range d.Devs {
- if q.Type == v.Type && q.ComputeType == v.ComputeType && q.Series == v.Series {
- return v.DeviceNum
- }
- }
- return 0
- }
-
- func GetUserGroupQuota(bearToken string, groupId int64) (*UserGroupResourceQuota, exports.APIError) {
- url := fmt.Sprintf("%s/user-group-resources?userGroupId=%d", configs.GetAppConfig().Resources.Aom, groupId)
- type Response struct {
- Items []UserGroupResourceQuota `json:"items"`
- }
- var qutoaList Response
- if err := AIStudioRequest(url, "GET", map[string]string{
- "Authorization": bearToken,
- }, nil, &qutoaList); err != nil {
- return nil, err
- } else if len(qutoaList.Items) == 0 {
- return nil, exports.NotFoundError()
- } else {
- group := &qutoaList.Items[0]
- for idx, _ := range group.Quotas { //@modify: use max as device num limit !!!
- group.Quotas[idx].DeviceNum = group.Quotas[idx].DeviceMax
- }
- return group, nil
- }
- }
-
- func GetResourceQutoaMachines(bearToken string, orgId int64, quotaId int, cpuArch string) ([]AomNodeQuota, APIError) {
- if orgId == 0 || quotaId == 0 {
- return nil, exports.ParameterError("invalid orgId or quotaId !!!")
- }
- url := fmt.Sprintf("%s/nodes?resourceQuotaId=%d&orgId=%d&cpuArch=%s",
- configs.GetAppConfig().Resources.Aom, quotaId, orgId, cpuArch)
- type Response struct {
- Items []AomNodeQuota `json:"items"`
- }
- var nodeList Response
- if err := AIStudioRequest(url, "GET", map[string]string{
- "Authorization": bearToken,
- }, nil, &nodeList); err != nil {
- return nil, err
- } else {
- return nodeList.Items, nil
- }
- }
-
- func GetResourceQuotaDetail(bearToken string, quotaId int) (*AomQuotaDetail, APIError) {
- if quotaId == 0 {
- return nil, exports.ParameterError("invalid quotaId !!!")
- }
- url := fmt.Sprintf("%s/resource-quotas", configs.GetAppConfig().Resources.Aom)
- type Response struct {
- Items []AomQuotaDetail `json:"items"`
- }
- //fmt.Println("request", url)
- var quotaList Response
- if err := AIStudioRequest(url, "GET", map[string]string{
- "Authorization": bearToken,
- }, nil, "aList); err != nil {
- return nil, err
- }
- fmt.Println("resource quotas:", quotaList)
- for _, v := range quotaList.Items {
- if v.ID == quotaId {
- return &v, nil
- }
- }
- return nil, exports.NotFoundError()
- }
-
- func GetUserGroupQuotaLimits(bearToken string, groupId int64, orgId int64, quotaId int, cpuArch string,
- modelId uint64,modelVersionId uint64) ([]dto.ResourceQuotaLimit, APIError) {
-
- quota_detail, err := GetResourceQuotaDetail(bearToken, quotaId)
- //pd, _ := json.Marshal(quota_detail)
- //fmt.Println(string(pd))
- if err != nil {
- return nil, err
- }
- var dev_min_num = 1
- var max_nodes = MAX_VCJOB_NODE
- //@add: support for filter by manifest device description
- if modelId > 0 && modelVersionId > 0 {
- if dev_min_num,max_nodes,err = ValidateModelResourceQutoa(bearToken,quota_detail,modelId,modelVersionId);err != nil {
- return nil,err
- }
- }
- logger.Infof("quota_id:%d dev_min_num:%d max_nodes:%d",quotaId,dev_min_num,max_nodes)
-
- var nodes []AomNodeQuota
- nodes, err = GetResourceQutoaMachines(bearToken, orgId, quotaId, cpuArch)
- //pd, _ = json.Marshal(nodes)
- //fmt.Println(string(pd))
- if err != nil {
- return nil, err
- }
- if len(nodes) == 0 {
- return []dto.ResourceQuotaLimit{}, nil
- }
- var UserGroupQuota *UserGroupResourceQuota
- UserGroupQuota, err = GetUserGroupQuota(bearToken, groupId)
- if err != nil {
- return nil, err
- }
- qutoa_table := [MAX_DEV_PER_NODE + 1]int{}
-
- result := []dto.ResourceQuotaLimit{}
-
- if quota_detail.Type == "CPU" {
-
- for _, v := range nodes { // find match nodes
- if v.GetDeviceNum(quota_detail) > 0 {
- qutoa_table[0]++
- }
- }
- quota_detail.DeviceNum = 0
- if max := UserGroupQuota.GetMaxNodes(quota_detail); max < qutoa_table[0] {
- qutoa_table[0] = max
- }
-
- if qutoa_table[0] > 0 {
- result = append(result, dto.ResourceQuotaLimit{
- Num: 0,
- Node: qutoa_table[0],
- })
- }
- } else if quota_detail.Type == "GPU" ||
- quota_detail.Type == "NPU" && strings.Contains(quota_detail.Series, "310") {
-
- for _, v := range nodes {
- devNum := v.GetDeviceNum(quota_detail)
- for i := dev_min_num; i <= MAX_DEV_PER_NODE; i++ {
- if devNum >= i {
- qutoa_table[i]++
- } else {
- break
- }
- }
- }
- for i := dev_min_num; i <= MAX_DEV_PER_NODE; i++ {
- quota_detail.DeviceNum = i
- max := UserGroupQuota.GetMaxNodes(quota_detail)
- if max > qutoa_table[i] {
- max = qutoa_table[i]
- }
- if max > 0 {
- result = append(result, dto.ResourceQuotaLimit{
- Num: i,
- Node: max,
- })
- } else {
- break
- }
- }
- } else if quota_detail.Type == "NPU" {
-
- for _, v := range nodes {
- devNum := v.GetDeviceNum(quota_detail)
- for _, i := range []int{1, 2, 4, 8} {
- if i < dev_min_num {
- continue
- }else if devNum >= i {
- qutoa_table[i]++
- } else {
- break
- }
- }
- }
- for _, i := range []int{1, 2, 4, 8} {
- if i < dev_min_num {
- continue
- }
- quota_detail.DeviceNum = i
- max := UserGroupQuota.GetMaxNodes(quota_detail)
- if max > qutoa_table[i] {
- max = qutoa_table[i]
- }
- if max > 0 {
- result = append(result, dto.ResourceQuotaLimit{
- Num: i,
- Node: max,
- })
- } else {
- break
- }
- }
- } else {
- return nil, exports.ParameterError("invalid device type:" + quota_detail.Type)
- }
- for idx,_ := range result {//cannot exceed max nodes !!!
- if result[idx].Node > max_nodes {
- result[idx].Node = max_nodes
- }
- }
- return result, nil
- }
-
- func ValidateUserGroupQuota(bearToken string, groupId int64, quotaId int, num int, nodes int,modelInfo *dto.GetModelVersionDetailResp, checkUsed bool) (*dto.Quota, APIError) {
- quota_detail, err := GetResourceQuotaDetail(bearToken, quotaId)
- if err != nil {
- return nil, err
- }
- //@add: check model device description quota
- if modelInfo != nil {
- if dev_min_num,max_nodes,err := ValidateModelResourceQutoa2(quota_detail,modelInfo);err != nil {
- return nil,err
- }else if num < dev_min_num || nodes > max_nodes{
- logger.Infof("ValidateUserGroupQuota: quota_id:%d dev_min_num:%d max_nodes:%d",quotaId,dev_min_num,max_nodes)
- return nil,exports.RaiseAPIError(exports.AIARTS_PARAM_ERROR,"num or nodes does not match model description !")
- }
- }
-
-
- var UserGroupQuota *UserGroupResourceQuota
- UserGroupQuota, err = GetUserGroupQuota(bearToken, groupId)
- if err != nil {
- return nil, err
- }
- if quota_detail.Type == "CPU" {
- quota_detail.DeviceNum = 0
- } else {
- quota_detail.DeviceNum = num
- }
- if UserGroupQuota.GetMaxNodes(quota_detail) < nodes {
- return nil, exports.RaiseAPIError(exports.AIARTS_COMPUTE_EXCEED_LIMIT)
- }
- if checkUsed { //@add: check used + request not exceed limit ???
- if err:=UserGroupQuota.CheckRequestFitLimit(quota_detail,nodes);err != "" {
- return nil,exports.RaiseReqWouldBlock(err)
- }
- }
- quota := &dto.Quota{
- Request: dto.ResourceData{
- CPU: fmt.Sprintf("%d", quota_detail.CpuNum),
- Memory: fmt.Sprintf("%d", quota_detail.Mem),
- Device: dto.ResourceDevice{
- DeviceType: quota_detail.DeviceType,
- DeviceNum: fmt.Sprintf("%d", quota_detail.DeviceNum),
- Series: quota_detail.Series,
- ComputeType: quota_detail.ComputeType,
- },
- },
- Node: nodes,
- Arch: quota_detail.Arch,
- }
- //quota.Limit = quota.Request
- UserGroupQuota.GetRequestLimits(quota,quota_detail)
-
- return quota,nil
- }
|