// OpenI / Apulis-AIArts-Backend

 
			
							/* ******************************************************************************
* 2019 - present Contributed by Apulis Technology (Shenzhen) Co. LTD
*
* This program and the accompanying materials are made available under the
* terms of the MIT License, which is available at
* https://www.opensource.org/licenses/MIT
*
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
* SPDX-License-Identifier: MIT
******************************************************************************/
package services

import (
	"fmt"
	"strconv"
	"strings"

	"github.com/apulis/app/ai-arts-backend/internal/configs"
	"github.com/apulis/app/ai-arts-backend/internal/dto"
	"github.com/apulis/app/ai-arts-backend/pkg/exports"
)

// AomComputeQuota describes a quota entry for one kind of compute device.
// Within this file the Type field is compared against "CPU", "GPU" and "NPU".
type AomComputeQuota struct {
	// Type is the device class ("CPU", "GPU" or "NPU" in this file).
	Type        string `json:"type"`
	// Arch is the CPU architecture; it is compared with the requested Arch.
	Arch        string `json:"arch"`
	// ComputeType is matched together with Type and Series when a quota is looked up.
	ComputeType string `json:"computeType"`
	// DeviceType is serialized under the JSON key "model".
	DeviceType  string `json:"model"`
	// Series is the device series; an NPU series containing "310" is
	// enumerated like a GPU by GetUserGroupQuotaLimits.
	Series      string `json:"series"`
	// DeviceNum is the device count; GetUserGroupQuota overwrites it with DeviceMax.
	DeviceNum   int    `json:"num"`
	// DeviceMax is serialized under the JSON key "max".
	DeviceMax   int    `json:"max"`
	// Used is the amount already consumed; CheckRequestFitLimit adds it to the request.
	Used        int    `json:"used"`
}

// Upper bounds used when enumerating schedulable resource combinations.
const (
	// MAX_DEV_PER_NODE is the largest per-node device count that is enumerated.
	MAX_DEV_PER_NODE = 8
	// MAX_VCJOB_NODE is the default cap on the node count of a single request.
	MAX_VCJOB_NODE = 32
)

// UserGroupResourceQuota is the resource budget of a user group: a memory
// ceiling with its current usage, plus one compute quota per device kind.
type UserGroupResourceQuota struct {
	// MemMax is the memory ceiling of the group.
	MemMax int64 `json:"memMax"`

	// MemUsed is the memory already consumed by the group.
	MemUsed int64 `json:"memUsed"`

	// Quotas lists the per-device compute quotas of the group.
	Quotas []AomComputeQuota `json:"quotas"`
}

// AomQuotaDetail is one resource-quota specification: the embedded device
// description plus the memory and CPU amounts that go with it.
type AomQuotaDetail struct {
	// AomComputeQuota is embedded, so its fields (Type, Arch, Series,
	// DeviceNum, ...) are promoted onto the detail.
	AomComputeQuota

	// Mem is the memory amount of this specification.
	Mem int64 `json:"mem"`

	// CpuNum is the CPU count of this specification.
	CpuNum int `json:"cpuNum"`

	// ID identifies the specification; GetResourceQuotaDetail selects by it.
	ID int `json:"id"`
}

// GetMaxNodes returns how many nodes, each consuming the per-node resources
// described by quota, fit into the limits of this group.
//
// The result is the smallest of:
//   - the matching compute quota divided by the per-node device count
//     (or by the per-node CPU count when quota.DeviceNum is not positive),
//   - MemMax divided by the per-node memory, and
//   - for device requests only, the CPU limit divided by the per-node CPU count.
//
// It returns 0 when the group has no matching compute quota. It also returns 0
// when quota.Mem or quota.CpuNum is zero: both are used as divisors and would
// otherwise trigger a division-by-zero panic.
func (d *UserGroupResourceQuota) GetMaxNodes(quota *AomQuotaDetail) int {
	limit := d.GetComputeLimit(quota)
	if limit == nil {
		return 0
	}
	// The specification comes from a remote service; a zero memory or CPU
	// amount cannot be scheduled, so report that no node fits.
	if quota.Mem == 0 || quota.CpuNum == 0 {
		return 0
	}

	// CPU-only requests carry no device count and are measured in CPUs.
	perNode := quota.DeviceNum
	if perNode <= 0 {
		perNode = quota.CpuNum
	}
	nodes := limit.DeviceNum / perNode

	if byMem := int(d.MemMax / quota.Mem); byMem < nodes {
		nodes = byMem
	}
	if quota.DeviceNum > 0 {
		// Device requests also consume CPUs, so the CPU limit applies too.
		if byCPU := d.GetCPULimit(quota) / quota.CpuNum; byCPU < nodes {
			nodes = byCPU
		}
	}
	return nodes
}

// GetRequestLimits fills in the limit side of quota.
//
// As the code stands it defaults quota.Node to 1 when it is 0 and copies
// quota.Request into quota.Limit, then returns. Neither d nor q is used on
// that path.
//
// NOTE(review): every statement after the bare `return` is unreachable (go vet
// reports it). It is retained on purpose here: it contains the only use of the
// "strconv" import in this file, so deleting it without also dropping that
// import would break the build.
func (d*UserGroupResourceQuota)  GetRequestLimits(quota* dto.Quota, q *AomQuotaDetail) {
	 // A request that does not specify a node count is treated as one node.
	 if quota.Node == 0 {
	 	quota.Node=1
	 }
	 //@modify: request and limit always remain the same.
	 quota.Limit =quota.Request
	 return
	 // ---- unreachable from here on; kept as the earlier limit computation ----
	 quota.Limit.Device=quota.Request.Device
	 //@todo: should each request use at most 80% of the total group quota?
	 // Per-node share of the group's CPU and memory limits.
	 cpuLimit := d.GetCPULimit(q)/quota.Node
	 memLimit := d.MemMax/int64(quota.Node)

	 // Parse errors are ignored, leaving the request value at 0.
	 cpuRequest,_ := strconv.ParseInt(quota.Request.CPU,0,32)
	 memRequest,_ := strconv.ParseInt(quota.Request.Memory,0,64)
	 // A limit is never set below the corresponding request.
	 if cpuLimit < int(cpuRequest) {
	 	cpuLimit = int(cpuRequest)
	 }
	 if memLimit < memRequest {
	 	memLimit = memRequest
	 }
	 quota.Limit.CPU =  fmt.Sprintf("%d",cpuLimit)
	 quota.Limit.Memory = fmt.Sprintf("%d",memLimit)
}

// GetCPULimit returns the largest DeviceNum among the group's "CPU" quotas
// whose architecture is acceptable for quota. An empty quota.Arch accepts
// every architecture. The result is 0 when no CPU quota qualifies.
//
// The limits are not summed across entries; only the biggest one counts.
func (d *UserGroupResourceQuota) GetCPULimit(quota *AomQuotaDetail) int {
	best := 0
	for i := range d.Quotas {
		entry := &d.Quotas[i]
		if entry.Type != "CPU" {
			continue
		}
		if len(quota.Arch) != 0 && quota.Arch != entry.Arch {
			continue
		}
		if entry.DeviceNum > best {
			best = entry.DeviceNum
		}
	}
	return best
}

// GetComputeLimit looks up the first group quota whose Type, ComputeType and
// Series all equal those of quota.
//
// The returned pointer refers to a copy of the entry, so changing it does not
// modify d.Quotas. nil is returned when nothing matches.
func (d *UserGroupResourceQuota) GetComputeLimit(quota *AomQuotaDetail) *AomComputeQuota {
	for i := range d.Quotas {
		candidate := d.Quotas[i] // copy; callers never alias the slice element
		if candidate.Type != quota.Type {
			continue
		}
		if candidate.ComputeType != quota.ComputeType || candidate.Series != quota.Series {
			continue
		}
		return &candidate
	}
	return nil
}

// GetCPULimitQuota looks up the first "CPU" quota of the group whose Arch
// equals arch exactly (an empty arch only matches an empty Arch).
//
// The returned pointer refers to a copy of the entry. nil is returned when
// nothing matches.
func (d *UserGroupResourceQuota) GetCPULimitQuota(arch string) *AomComputeQuota {
	for i := range d.Quotas {
		if d.Quotas[i].Type != "CPU" || d.Quotas[i].Arch != arch {
			continue
		}
		found := d.Quotas[i] // copy; callers never alias the slice element
		return &found
	}
	return nil
}

// CheckRequestFitLimit reports whether a request for `nodes` nodes of the
// given specification still fits next to what the group already uses.
//
// It returns an empty string when the request fits, otherwise a message naming
// the first violated limit. The checks run in this order:
//  1. memory: MemUsed plus the requested memory must not exceed MemMax;
//  2. the matching compute quota must exist and, together with its Used
//     amount, cover the requested devices (or CPUs when quota.DeviceNum is 0);
//  3. for device requests, the CPU quota of quota.Arch must exist and cover
//     the requested CPUs.
func (d *UserGroupResourceQuota) CheckRequestFitLimit(quota *AomQuotaDetail, nodes int) string {
	memWanted := quota.Mem * int64(nodes)
	if d.MemUsed+memWanted > d.MemMax {
		return "check memory used exceed quota !"
	}

	compute := d.GetComputeLimit(quota)
	if compute == nil {
		return fmt.Sprintf("not found request quota %s %s %s", quota.Type, quota.ComputeType, quota.Series)
	}
	switch {
	case quota.DeviceNum > 0:
		if wanted := quota.DeviceNum * nodes; wanted+compute.Used > compute.DeviceNum {
			return fmt.Sprintf("request device num %d used %d exceed %d !", wanted, compute.Used, compute.DeviceNum)
		}
	case quota.DeviceNum == 0:
		// CPU-only request: the compute quota itself is counted in CPUs.
		if wanted := quota.CpuNum * nodes; wanted+compute.Used > compute.DeviceNum {
			return fmt.Sprintf("request cpu quota %d used %d exceed %d !", wanted, compute.Used, compute.DeviceNum)
		}
	}

	if quota.DeviceNum <= 0 {
		return ""
	}
	// Device requests additionally consume CPUs of the matching architecture.
	cpu := d.GetCPULimitQuota(quota.Arch)
	if cpu == nil {
		return fmt.Sprintf("not found request type %s cpu arch %s!", quota.Type, quota.Arch)
	}
	if wanted := quota.CpuNum * nodes; wanted+cpu.Used > cpu.DeviceNum {
		return fmt.Sprintf("[cpu] request cpu quota %d used %d exceed %d !", wanted, cpu.Used, cpu.DeviceNum)
	}
	return ""
}

// AomNodeQuota describes the resources of a single node as decoded from the
// "items" list of the nodes endpoint queried by GetResourceQutoaMachines.
type AomNodeQuota struct {
	// CpuArch is compared with the requested Arch in GetDeviceNum.
	CpuArch string            `json:"cpuArch"`
	// CpuNum must be at least the requested CpuNum for the node to qualify.
	CpuNum  int               `json:"cpuNum"`
	// Mem must be at least the requested Mem for the node to qualify.
	Mem     int64             `json:"mem"`
	// Status is decoded but not read anywhere in this file.
	Status  string            `json:"status"`
	// Roles is a comma-separated list; a node with the role "edge" is skipped.
	Roles   string            `json:"roles"`
	// Devs lists the node's devices, matched by Type, ComputeType and Series.
	Devs    []AomComputeQuota `json:"devs"`
}


// GetDeviceNum returns how many devices matching q this node offers, or 0
// when the node cannot host the specification at all.
//
// A node is rejected when it has less memory or fewer CPUs than q asks for,
// when q names an architecture different from the node's, or when one of its
// comma-separated roles is exactly "edge". Otherwise the DeviceNum of the
// first device with the same Type, ComputeType and Series is returned.
func (d *AomNodeQuota) GetDeviceNum(q *AomQuotaDetail) int {
	enoughResources := d.Mem >= q.Mem && d.CpuNum >= q.CpuNum
	archAccepted := len(q.Arch) == 0 || q.Arch == d.CpuArch
	if !enoughResources || !archAccepted {
		return 0
	}

	//@todo: the exclusion of edge nodes is expected to change.
	for _, role := range strings.Split(d.Roles, ",") {
		if role == "edge" {
			return 0
		}
	}

	for i := range d.Devs {
		dev := &d.Devs[i]
		if dev.Type == q.Type && dev.ComputeType == q.ComputeType && dev.Series == q.Series {
			return dev.DeviceNum
		}
	}
	return 0
}

// GetUserGroupQuota fetches the resource quota of the user group groupId from
// the AOM "user-group-resources" endpoint, authorizing with bearToken.
//
// Only the first returned item is used; a not-found error is returned when the
// list is empty. Before returning, every compute quota's DeviceNum is
// replaced by its DeviceMax, so callers see the maximum as the device limit.
func GetUserGroupQuota(bearToken string, groupId int64) (*UserGroupResourceQuota, exports.APIError) {
	type Response struct {
		Items []UserGroupResourceQuota `json:"items"`
	}

	endpoint := fmt.Sprintf("%s/user-group-resources?userGroupId=%d", configs.GetAppConfig().Resources.Aom, groupId)
	headers := map[string]string{
		"Authorization": bearToken,
	}

	var reply Response
	if err := AIStudioRequest(endpoint, "GET", headers, nil, &reply); err != nil {
		return nil, err
	}
	if len(reply.Items) == 0 {
		return nil, exports.NotFoundError()
	}

	group := &reply.Items[0]
	//@modify: use max as the device number limit.
	for i := range group.Quotas {
		group.Quotas[i].DeviceNum = group.Quotas[i].DeviceMax
	}
	return group, nil
}

// GetResourceQutoaMachines lists the nodes reported by the AOM "nodes"
// endpoint for the given resource quota, organization and CPU architecture,
// authorizing with bearToken.
//
// A parameter error is returned when orgId or quotaId is 0.
//
// NOTE(review): cpuArch is placed into the query string without URL escaping;
// confirm that callers only pass plain architecture names.
func GetResourceQutoaMachines(bearToken string, orgId int64, quotaId int, cpuArch string) ([]AomNodeQuota, APIError) {
	if orgId == 0 || quotaId == 0 {
		return nil, exports.ParameterError("invalid orgId or quotaId !!!")
	}

	type Response struct {
		Items []AomNodeQuota `json:"items"`
	}

	endpoint := fmt.Sprintf("%s/nodes?resourceQuotaId=%d&orgId=%d&cpuArch=%s",
		configs.GetAppConfig().Resources.Aom, quotaId, orgId, cpuArch)
	headers := map[string]string{
		"Authorization": bearToken,
	}

	var reply Response
	if err := AIStudioRequest(endpoint, "GET", headers, nil, &reply); err != nil {
		return nil, err
	}
	return reply.Items, nil
}

// GetResourceQuotaDetail fetches all resource quotas from the AOM
// "resource-quotas" endpoint and returns a copy of the one whose ID equals
// quotaId.
//
// A parameter error is returned when quotaId is 0, and a not-found error when
// no listed quota carries that ID.
//
// NOTE(review): the complete quota list is printed to stdout on every call;
// this looks like leftover debugging and could go through the package logger.
func GetResourceQuotaDetail(bearToken string, quotaId int) (*AomQuotaDetail, APIError) {
	if quotaId == 0 {
		return nil, exports.ParameterError("invalid quotaId !!!")
	}

	type Response struct {
		Items []AomQuotaDetail `json:"items"`
	}

	endpoint := fmt.Sprintf("%s/resource-quotas", configs.GetAppConfig().Resources.Aom)
	headers := map[string]string{
		"Authorization": bearToken,
	}

	var quotaList Response
	if err := AIStudioRequest(endpoint, "GET", headers, nil, &quotaList); err != nil {
		return nil, err
	}
	fmt.Println("resource quotas:", quotaList)

	for i := range quotaList.Items {
		if quotaList.Items[i].ID != quotaId {
			continue
		}
		found := quotaList.Items[i] // hand out a copy, not the slice element
		return &found, nil
	}
	return nil, exports.NotFoundError()
}

// GetUserGroupQuotaLimits enumerates which (devices per node, node count)
// combinations of the resource quota quotaId the user group may request.
//
// For every candidate per-node device count it reports the largest node
// count allowed by both the number of nodes offering that many matching
// devices and the group's quota (GetMaxNodes). Candidates are:
//   - "CPU": a single entry with Num 0;
//   - "GPU", or "NPU" whose series contains "310": every count from the
//     minimum up to MAX_DEV_PER_NODE;
//   - any other "NPU": the counts 1, 2, 4 and 8 that are not below the minimum.
//
// Enumeration stops at the first candidate that allows no node. When both
// modelId and modelVersionId are set, ValidateModelResourceQutoa supplies the
// minimum device count and the node cap; otherwise they default to 1 and
// MAX_VCJOB_NODE. An empty, non-nil list is returned when no node is
// available, and a parameter error for any other device type.
func GetUserGroupQuotaLimits(bearToken string, groupId int64, orgId int64, quotaId int, cpuArch string,
	modelId uint64, modelVersionId uint64) ([]dto.ResourceQuotaLimit, APIError) {

	detail, err := GetResourceQuotaDetail(bearToken, quotaId)
	if err != nil {
		return nil, err
	}

	minDevs, nodeCap := 1, MAX_VCJOB_NODE
	//@add: support filtering by the device description of the model manifest
	if modelId > 0 && modelVersionId > 0 {
		minDevs, nodeCap, err = ValidateModelResourceQutoa(bearToken, detail, modelId, modelVersionId)
		if err != nil {
			return nil, err
		}
	}
	logger.Infof("quota_id:%d dev_min_num:%d max_nodes:%d", quotaId, minDevs, nodeCap)

	var machines []AomNodeQuota
	machines, err = GetResourceQutoaMachines(bearToken, orgId, quotaId, cpuArch)
	if err != nil {
		return nil, err
	}
	if len(machines) == 0 {
		return []dto.ResourceQuotaLimit{}, nil
	}

	var group *UserGroupResourceQuota
	group, err = GetUserGroupQuota(bearToken, groupId)
	if err != nil {
		return nil, err
	}

	// fitting[n] counts the nodes that offer at least n matching devices;
	// index 0 is used for CPU-only specifications.
	var fitting [MAX_DEV_PER_NODE + 1]int
	limits := []dto.ResourceQuotaLimit{}

	// enumerate handles device specifications: candidates must be ascending
	// per-node device counts.
	enumerate := func(candidates []int) {
		for n := range machines {
			available := machines[n].GetDeviceNum(detail)
			for _, want := range candidates {
				if available < want {
					break
				}
				fitting[want]++
			}
		}
		for _, want := range candidates {
			detail.DeviceNum = want
			allowed := group.GetMaxNodes(detail)
			if allowed > fitting[want] {
				allowed = fitting[want]
			}
			if allowed <= 0 {
				// Larger counts cannot fit either.
				break
			}
			limits = append(limits, dto.ResourceQuotaLimit{
				Num:  want,
				Node: allowed,
			})
		}
	}

	switch {
	case detail.Type == "CPU":
		for n := range machines { // count the matching nodes
			if machines[n].GetDeviceNum(detail) > 0 {
				fitting[0]++
			}
		}
		detail.DeviceNum = 0
		if allowed := group.GetMaxNodes(detail); allowed < fitting[0] {
			fitting[0] = allowed
		}
		if fitting[0] > 0 {
			limits = append(limits, dto.ResourceQuotaLimit{
				Num:  0,
				Node: fitting[0],
			})
		}

	case detail.Type == "GPU",
		detail.Type == "NPU" && strings.Contains(detail.Series, "310"):
		var candidates []int
		for c := minDevs; c <= MAX_DEV_PER_NODE; c++ {
			candidates = append(candidates, c)
		}
		enumerate(candidates)

	case detail.Type == "NPU":
		var candidates []int
		for _, c := range []int{1, 2, 4, 8} {
			if c >= minDevs {
				candidates = append(candidates, c)
			}
		}
		enumerate(candidates)

	default:
		return nil, exports.ParameterError("invalid device type:" + detail.Type)
	}

	// No entry may exceed the node cap.
	for i := range limits {
		if limits[i].Node > nodeCap {
			limits[i].Node = nodeCap
		}
	}
	return limits, nil
}

// ValidateUserGroupQuota checks that the user group may request `nodes` nodes
// with `num` devices each of the resource quota quotaId and, if so, builds
// the dto.Quota describing that request.
//
// Steps, in order:
//  1. load the quota specification;
//  2. when modelInfo is given, reject requests whose num is below the model's
//     minimum device count or whose nodes exceed the model's node cap;
//  3. load the group quota and reject requests above GetMaxNodes;
//  4. when checkUsed is set, also reject requests that do not fit next to the
//     group's current usage (CheckRequestFitLimit).
//
// For "CPU" specifications the per-node device count is forced to 0 and num
// is ignored for the quota itself.
func ValidateUserGroupQuota(bearToken string, groupId int64, quotaId int, num int, nodes int, modelInfo *dto.GetModelVersionDetailResp, checkUsed bool) (*dto.Quota, APIError) {
	detail, err := GetResourceQuotaDetail(bearToken, quotaId)
	if err != nil {
		return nil, err
	}

	//@add: check the request against the device description of the model
	if modelInfo != nil {
		minDevs, nodeCap, modelErr := ValidateModelResourceQutoa2(detail, modelInfo)
		if modelErr != nil {
			return nil, modelErr
		}
		if num < minDevs || nodes > nodeCap {
			logger.Infof("ValidateUserGroupQuota: quota_id:%d dev_min_num:%d max_nodes:%d", quotaId, minDevs, nodeCap)
			return nil, exports.RaiseAPIError(exports.AIARTS_PARAM_ERROR, "num or nodes does not match model description !")
		}
	}

	var group *UserGroupResourceQuota
	group, err = GetUserGroupQuota(bearToken, groupId)
	if err != nil {
		return nil, err
	}

	// CPU specifications carry no devices; everything else uses the request.
	detail.DeviceNum = num
	if detail.Type == "CPU" {
		detail.DeviceNum = 0
	}

	if group.GetMaxNodes(detail) < nodes {
		return nil, exports.RaiseAPIError(exports.AIARTS_COMPUTE_EXCEED_LIMIT)
	}
	//@add: optionally make sure used + requested stays within the limit
	if checkUsed {
		if reason := group.CheckRequestFitLimit(detail, nodes); reason != "" {
			return nil, exports.RaiseReqWouldBlock(reason)
		}
	}

	device := dto.ResourceDevice{
		DeviceType:  detail.DeviceType,
		DeviceNum:   fmt.Sprintf("%d", detail.DeviceNum),
		Series:      detail.Series,
		ComputeType: detail.ComputeType,
	}
	quota := &dto.Quota{
		Request: dto.ResourceData{
			CPU:    fmt.Sprintf("%d", detail.CpuNum),
			Memory: fmt.Sprintf("%d", detail.Mem),
			Device: device,
		},
		Node: nodes,
		Arch: detail.Arch,
	}
	// Derives quota.Limit from quota.Request.
	group.GetRequestLimits(quota, detail)

	return quota, nil
}