#498 master

Merged
yangxzh1 merged 4 commits from openioctopus/octopus:master into master 1 year ago
  1. +52
    -0
      deploy/charts/octopus/templates/rmda.yaml
  2. +33
    -20
      deploy/charts/octopus/values.yaml
  3. +1
    -0
      server/base-server/internal/common/constant.go
  4. +10
    -0
      server/base-server/internal/service/develop/develop.go
  5. +10
    -0
      server/base-server/internal/service/trainjob/train_job.go

+ 52
- 0
deploy/charts/octopus/templates/rmda.yaml View File

@@ -0,0 +1,52 @@
{{- if .Values.rdma.enabled }}
# Plugin configuration. .Values.rdma.config is a string; toJson quotes it so it
# is emitted as a single YAML scalar stored under the key config.json.
apiVersion: v1
kind: ConfigMap
metadata:
  name: rdma-devices
  namespace: kube-system
data:
  config.json: {{ .Values.rdma.config | toJson }}
---
# DaemonSet running the RDMA shared device plugin container on the cluster nodes.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: rdma-shared-dp-ds
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: rdma-shared-dp-ds
  template:
    metadata:
      labels:
        name: rdma-shared-dp-ds
    spec:
      hostNetwork: true
      priorityClassName: system-node-critical
      containers:
        # NOTE(review): the image has no tag, so combined with IfNotPresent the
        # version that runs depends on what each node has cached - consider pinning.
        - image: mellanox/k8s-rdma-shared-dev-plugin
          name: k8s-rdma-shared-dp-ds
          imagePullPolicy: IfNotPresent
          securityContext:
            privileged: true
          volumeMounts:
            # Host kubelet directory (see the device-plugin hostPath volume below).
            - name: device-plugin
              mountPath: /var/lib/kubelet/
            # config.json from the rdma-devices ConfigMap defined above.
            - name: config
              mountPath: /k8s-rdma-shared-dev-plugin
            # Host /dev tree.
            - name: devs
              mountPath: /dev/
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/
        - name: config
          configMap:
            name: rdma-devices
            items:
              - key: config.json
                path: config.json
        - name: devs
          hostPath:
            path: /dev/
{{- end }}

+ 33
- 20
deploy/charts/octopus/values.yaml View File

@@ -17,18 +17,18 @@ global:
affinity: &affinity
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["amd64", "x64", "x86-64", "x86_64"]
- weight: 100
preference:
matchExpressions:
- key: beta.kubernetes.io/arch
operator: In
values: ["amd64", "x64", "x86-64", "x86_64"]
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["amd64", "x64", "x86-64", "x86_64"]
- weight: 100
preference:
matchExpressions:
- key: beta.kubernetes.io/arch
operator: In
values: ["amd64", "x64", "x86-64", "x86_64"]

common:
resourceTagKey: octopus.pcl.ac.cn/type
@@ -250,7 +250,7 @@ scheduler:
replicaCount: 1
image:
name: "scheduler"

# controller
controller:
@@ -382,13 +382,13 @@ influxdb:
enabled: true
existingClaim: "octopus-influxdb-pvc"
volumes:
- name: influxdb-initdb
configMap:
name: influxdb-initdb-config
- name: influxdb-initdb
configMap:
name: influxdb-initdb-config
mountPoints:
- name: influxdb-initdb
mountPath: /docker-entrypoint-initdb.d
readOnly: true
- name: influxdb-initdb
mountPath: /docker-entrypoint-initdb.d
readOnly: true
nodeSelector:
<<: *nodeSelector
affinity:
@@ -503,4 +503,17 @@ sftpgo:
data_provider:
create_default_admin: true
nodeSelector:
<<: *nodeSelector
<<: *nodeSelector

rdma:
  # When true, the chart renders the rdma-devices ConfigMap and the
  # rdma-shared-dp-ds DaemonSet (templates/rmda.yaml).
  enabled: false
  # JSON document stored as config.json in the rdma-devices ConfigMap.
  # NOTE(review): "devices" presumably lists the host RDMA network interfaces
  # to expose - adjust to the cluster's hardware and confirm against the plugin docs.
  config: |
    {
      "periodicUpdateInterval": 300,
      "configList": [{
          "resourceName": "hca",
          "rdmaHcaMax": 1000,
          "devices": ["ib0"]
        }
      ]
    }

+ 1
- 0
server/base-server/internal/common/constant.go View File

@@ -2,6 +2,7 @@ package common

const (
	// BillingPrecision is the precision used for billing values.
	// NOTE(review): presumably the number of decimal places - confirm against callers.
	BillingPrecision = 2

	// RdmaPrefix is the resource-name prefix that marks a resource as RDMA.
	// Job submission compares resource keys against it with strings.HasPrefix.
	RdmaPrefix = "rdma/"
)

const (


+ 10
- 0
server/base-server/internal/service/develop/develop.go View File

@@ -635,6 +635,16 @@ func (s *developService) submitJob(ctx context.Context, nb *model.Notebook, nbJo
},
},
}

// Grant the IPC_LOCK capability to the first container when any requested
// resource key starts with common.RdmaPrefix.
// NOTE(review): presumably required so RDMA libraries can lock memory - confirm.
// The assignment replaces whatever SecurityContext that container had before.
for k := range startJobInfo.resources {
	if !strings.HasPrefix(string(k), common.RdmaPrefix) {
		continue
	}
	task.Template.Spec.Containers[0].SecurityContext = &v1.SecurityContext{
		Capabilities: &v1.Capabilities{
			Add: []v1.Capability{"IPC_LOCK"},
		},
	}
	// One match is enough: every further match would assign the same value.
	break
}
tasks = append(tasks, task)
}



+ 10
- 0
server/base-server/internal/service/trainjob/train_job.go View File

@@ -612,6 +612,16 @@ func (s *trainJobService) submitJob(ctx context.Context, job *model.TrainJob, st
{Event: vcBus.TaskCompletedEvent, Action: vcBus.CompleteJobAction},
}
}

// Grant the IPC_LOCK capability to the first container when any resource key
// of this task's resource spec starts with common.RdmaPrefix.
// NOTE(review): presumably required so RDMA libraries can lock memory - confirm.
// The assignment replaces whatever SecurityContext that container had before.
for k := range startJobInfo.specs[i.ResourceSpecId].resources {
	if !strings.HasPrefix(string(k), common.RdmaPrefix) {
		continue
	}
	task.Template.Spec.Containers[0].SecurityContext = &v1.SecurityContext{
		Capabilities: &v1.Capabilities{
			Add: []v1.Capability{"IPC_LOCK"},
		},
	}
	// One match is enough: every further match would assign the same value.
	break
}
tasks = append(tasks, task)
}



Loading…
Cancel
Save