Browse Source

Merge pull request '#12, #62, #64' (#87) from openioctopus/octopus:initInfo into master

Reviewed-on: #87
master
liwei03 1 week ago
parent
commit
c923472b23
47 changed files with 1884 additions and 221 deletions
  1. +48
    -0
      .drone.yml
  2. +2
    -0
      .golangci.yaml
  3. +17
    -2
      Makefile
  4. +9
    -0
      build/application/api-doc/dockerfile
  5. +2
    -1
      build/basic_environment/kratos-v2/dockerfile
  6. +3
    -0
      deploy/charts/octopus/Chart.yaml
  7. BIN
      deploy/charts/octopus/charts/influxdb-2.3.9.tgz
  8. BIN
      deploy/charts/octopus/charts/influxdb-4.10.0.tgz
  9. +4
    -1
      deploy/charts/octopus/requirements.yaml
  10. +102
    -0
      deploy/charts/octopus/templates/_helpers.tpl
  11. +72
    -0
      deploy/charts/octopus/templates/api-doc.yaml
  12. +5
    -0
      deploy/charts/octopus/templates/base-server.yaml
  13. +86
    -0
      deploy/charts/octopus/templates/eventrouter.yaml
  14. +24
    -0
      deploy/charts/octopus/templates/ingress.yaml
  15. +10
    -1
      deploy/charts/octopus/templates/storage/initdb.yaml
  16. +16
    -1
      deploy/charts/octopus/templates/storage/pv.yaml
  17. +15
    -0
      deploy/charts/octopus/templates/storage/pvc.yaml
  18. +48
    -1
      deploy/charts/octopus/values.yaml
  19. +8
    -0
      openai-portal/src/api/trainingManager.js
  20. +139
    -33
      openai-portal/src/views/trainingManager/components/detailDialog/taskInfo.vue
  21. +1
    -1
      server/admin-server/.gitignore
  22. +0
    -2
      server/admin-server/api/v1/trainJob.proto
  23. +1
    -1
      server/admin-server/configs/config.yaml
  24. +34
    -2
      server/base-server/api/v1/develop.proto
  25. +33
    -2
      server/base-server/api/v1/trainJob.proto
  26. +6
    -1
      server/base-server/configs/config.yaml
  27. +9
    -0
      server/base-server/internal/conf/conf.proto
  28. +60
    -3
      server/base-server/internal/data/dao/develop.go
  29. +15
    -0
      server/base-server/internal/data/dao/model/develop.go
  30. +15
    -0
      server/base-server/internal/data/dao/model/train_job.go
  31. +62
    -5
      server/base-server/internal/data/dao/train_job.go
  32. +9
    -2
      server/base-server/internal/data/data.go
  33. +101
    -0
      server/base-server/internal/data/influxdb/influxdb.go
  34. +27
    -4
      server/base-server/internal/service/develop/develop.go
  35. +30
    -1
      server/base-server/internal/service/trainjob/train_job.go
  36. +4
    -0
      server/common/errors/codes.go
  37. +74
    -31
      server/common/utils/protoc.go
  38. +7
    -5
      server/go.mod
  39. +658
    -0
      server/go.sum
  40. +1
    -1
      server/openai-server/.gitignore
  41. +37
    -1
      server/openai-server/api/v1/develop.proto
  42. +37
    -2
      server/openai-server/api/v1/trainJob.proto
  43. +1
    -1
      server/openai-server/configs/config.yaml
  44. +26
    -0
      server/openai-server/internal/service/develop.go
  45. +26
    -0
      server/openai-server/internal/service/trainjob.go
  46. +0
    -3
      server/taskset/pkg/pipeline/services/kubernetes/event_handlers.go
  47. +0
    -113
      server/taskset/pkg/pipeline/services/kubernetes/logs_helper.go

+ 48
- 0
.drone.yml View File

@@ -427,3 +427,51 @@ steps:
from_secret: docker_hub_project
commands:
- make openai-portal_image_push need_latest=FALSE tag=${DRONE_TAG} docker_hub_project=$DOCKER_HUB_PROJECT docker_hub_host=$DOCKER_HUB_HOST docker_hub_userame=$DOCKER_HUB_USERNAME docker_hub_passwd=$DOCKER_HUB_PASSWD

---
kind: pipeline
name: api-doc
type: kubernetes
platform:
os: linux
arch: amd64
trigger:
event:
- tag
volumes:
- name: docker
host:
path: /var/run/
steps:
# - name: 代码检查
# image: golangci/golangci-lint:v1.40.1
# environment:
# GO111MODULE: on
# GOPROXY: https://goproxy.cn,direct
# commands:
# - make admin-server_lint

- name: 构建镜像
image: swr.cn-south-1.myhuaweicloud.com/openioctopus/docker:20.10.6-make
volumes:
- name: docker
path: /var/run/
commands:
- make api-doc_image tag=${DRONE_TAG}

- name: 镜像推送
image: swr.cn-south-1.myhuaweicloud.com/openioctopus/docker:20.10.6-make
volumes:
- name: docker
path: /var/run/
environment:
DOCKER_HUB_HOST:
from_secret: docker_hub_host
DOCKER_HUB_USERNAME:
from_secret: docker_hub_userame
DOCKER_HUB_PASSWD:
from_secret: docker_hub_passwd
DOCKER_HUB_PROJECT:
from_secret: docker_hub_project
commands:
- make api-doc_image_push need_latest=FALSE tag=${DRONE_TAG} docker_hub_project=$DOCKER_HUB_PROJECT docker_hub_host=$DOCKER_HUB_HOST docker_hub_userame=$DOCKER_HUB_USERNAME docker_hub_passwd=$DOCKER_HUB_PASSWD

+ 2
- 0
.golangci.yaml View File

@@ -71,3 +71,5 @@ run:
- resourcepool.go
- resourcespec.go
- node.go
- develop.go
- train_job.go

+ 17
- 2
Makefile View File

@@ -73,6 +73,8 @@ vc-controller_build: init
scheduler_build: init
cd ./server/taskset && go build -ldflags ${LD_FLAGS} -o ${SERVER_BINARY_DIR} ./main/scheduler

api-doc_build: init
cd ./server && go generate
# 运行
all_run: server_run

@@ -165,7 +167,7 @@ taskset_lint: lint_init
cd ./server/taskset && golangci-lint run ./...

# 构建镜像
images: base-server_image admin-server_image openai-server_image taskset_image admin-portal_image openai-portal_image
images: base-server_image admin-server_image openai-server_image taskset_image admin-portal_image openai-portal_image api-doc_image

base-server_image:
docker build --no-cache -t base-server:${RELEASE_VER} -f ./build/application/base-server/dockerfile .
@@ -193,8 +195,11 @@ admin-portal_image:
openai-portal_image:
docker build --no-cache -t openai-portal:${RELEASE_VER} -f ./build/application/openai-portal/dockerfile .

api-doc_image:
docker build --no-cache -t api-doc:${RELEASE_VER} -f ./build/application/api-doc/dockerfile .

# 镜像推送
images_push: base-server_image_push admin-server_image_push openai-server_image_push taskset_image_push admin-portal_image_push openai-portal_image_push
images_push: base-server_image_push admin-server_image_push openai-server_image_push taskset_image_push admin-portal_image_push openai-portal_image_push api-doc_image_push

image_push_init:
(echo ${DOCKER_HUB_PASSWD} | docker login ${DOCKER_HUB_HOST} -u ${DOCKER_HUB_USERNAME} --password-stdin) 1>/dev/null 2>&1
@@ -290,6 +295,16 @@ ifeq (${NEED_LATEST}, TRUE)
endif
endif

api-doc_image_push: image_push_init
docker tag api-doc:${RELEASE_VER} ${DOCKER_HUB_HOST}/${DOCKER_HUB_PROJECT}/api-doc:${RELEASE_VER}
docker push ${DOCKER_HUB_HOST}/${DOCKER_HUB_PROJECT}/api-doc:${RELEASE_VER}

ifneq (${RELEASE_VER}, latest)
ifeq (${NEED_LATEST}, TRUE)
docker tag api-doc:${RELEASE_VER} ${DOCKER_HUB_HOST}/${DOCKER_HUB_PROJECT}/api-doc:latest
docker push ${DOCKER_HUB_HOST}/${DOCKER_HUB_PROJECT}/api-doc:latest
endif
endif

# helm chart
charts: charts_build charts_push


+ 9
- 0
build/application/api-doc/dockerfile View File

@@ -0,0 +1,9 @@
from swr.cn-south-1.myhuaweicloud.com/openioctopus/kratos:v2 as builder
WORKDIR /app
COPY ./ ./

RUN make api-doc_build

FROM swaggerapi/swagger-ui:v3.52.3
COPY --from=builder /app/server/admin-server/api/v1/swagger.json /usr/share/nginx/html/admin.swagger.json
COPY --from=builder /app/server/openai-server/api/v1/swagger.json /usr/share/nginx/html/openai.swagger.json

+ 2
- 1
build/basic_environment/kratos-v2/dockerfile View File

@@ -14,4 +14,5 @@ RUN go get -u google.golang.org/protobuf/cmd/protoc-gen-go@v1.26.0 \
&& go get -u github.com/envoyproxy/protoc-gen-validate@v0.5.1 \
&& go get -u github.com/go-kratos/kratos/cmd/protoc-gen-go-http/v2@46acad3 \
&& go get -u github.com/go-kratos/kratos/cmd/protoc-gen-go-errors/v2@46acad3 \
&& go get -u github.com/go-kratos/kratos/v2@v2.0.0-beta3
&& go get -u github.com/go-kratos/kratos/v2@v2.0.0-beta3 \
&& go get -u github.com/grpc-ecosystem/grpc-gateway/v2/protoc-gen-openapiv2@v2.6.0

+ 3
- 0
deploy/charts/octopus/Chart.yaml View File

@@ -35,6 +35,9 @@ dependencies:
- name: nginx-ingress-controller
version: 7.6.9
repository: https://charts.bitnami.com/bitnami
- name: influxdb
version: 4.10.0
repository: https://helm.influxdata.com/

home: https://octopus.openi.org.cn/
icon: https://git.openi.org.cn/OpenI/octopus/media/branch/master/logo.png


BIN
deploy/charts/octopus/charts/influxdb-2.3.9.tgz View File


BIN
deploy/charts/octopus/charts/influxdb-4.10.0.tgz View File


+ 4
- 1
deploy/charts/octopus/requirements.yaml View File

@@ -10,4 +10,7 @@ dependencies:
repository: https://charts.bitnami.com/bitnami
- name: nginx-ingress-controller
version: 7.6.9
repository: https://charts.bitnami.com/bitnami
repository: https://charts.bitnami.com/bitnami
- name: influxdb
version: 4.10.0
repository: https://helm.influxdata.com/

+ 102
- 0
deploy/charts/octopus/templates/_helpers.tpl View File

@@ -695,6 +695,52 @@ octopus.pcl.ac.cn/resource: {{ .Values.common.resourceTagValuePrefix }}_{{ inclu
{{- printf "%s:%s" (include "redis.serviceName" .) .Values.redis.master.service.port -}}
{{- end -}}

{{/******************influxdb******************/}}

{{- define "influxdb.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}

{{- define "influxdb.fullname" -}}
{{- if .Values.fullnameOverride -}}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- if contains $name .Release.Name -}}
{{- printf "%s-influxdb" .Release.Name | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}
{{- end -}}

{{- define "influxdb.serviceName" -}}
{{- printf "%s" (include "influxdb.fullname" .) -}}
{{- end -}}

{{- define "influxdb.serviceAddr" -}}
{{- printf "%s:8086" (include "influxdb.serviceName" .) -}}
{{- end -}}

{{/******************eventrouter******************/}}

{{- define "eventrouter.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}

{{- define "eventrouter.fullname" -}}
{{- if .Values.fullnameOverride -}}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- if contains $name .Release.Name -}}
{{- printf "%s-eventrouter" .Release.Name | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}
{{- end -}}

{{/****************** Prometheus ******************/}}

{{- define "prometheus.name" -}}
@@ -778,4 +824,60 @@ app.kubernetes.io/part-of: {{ include "grafana.name" . }}

{{- define "prometheus.address" -}}
{{- printf "http://%s.%s:%s" (include "prometheus.fullname" .) .Release.Namespace .Values.grafana.prometheus.port -}}
{{- end -}}

{{/******************api-doc******************/}}

{{- define "apidoc.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}

{{- define "apidoc.fullname" -}}
{{- if .Values.fullnameOverride -}}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- if contains $name .Release.Name -}}
{{- printf "%s-apidoc" .Release.Name | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}
{{- end -}}

{{- define "apidoc.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
{{- end -}}

{{- define "apidoc.core-labels" -}}
helm.sh/chart: {{ include "apidoc.chart" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end -}}

{{- define "apidoc.select-labels" -}}
app.kubernetes.io/name: {{ include "apidoc.name" . }}
app.kubernetes.io/instance: {{ include "apidoc.fullname" . }}
app.kubernetes.io/part-of: {{ include "apidoc.name" . }}
{{- end -}}

{{- define "apidoc.resource-labels" -}}
octopus.pcl.ac.cn/resource: {{ .Values.common.resourceTagValuePrefix }}_{{ include "apidoc.fullname" . }}_{{ default .Chart.AppVersion .Values.apidoc.image.tag }}
{{- end -}}


{{- define "apidoc.labels" -}}
{{ include "apidoc.core-labels" . }}
{{ include "apidoc.select-labels" . }}
{{ include "apidoc.resource-labels" . }}
{{- end -}}

{{- define "apidoc.port" -}}
{{- printf "8080" -}}
{{- end -}}

{{- define "apidoc.targetPort" -}}
{{- printf "8080" -}}
{{- end -}}

+ 72
- 0
deploy/charts/octopus/templates/api-doc.yaml View File

@@ -0,0 +1,72 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ template "apidoc.fullname" . }}

---

apiVersion: v1
kind: Service
metadata:
name: {{ template "apidoc.fullname" . }}
labels:
{{ include "apidoc.labels" . | indent 4 }}
spec:
ports:
- name: http
protocol: TCP
port: {{ template "apidoc.port" . }}
targetPort: {{ template "apidoc.port" . }}
selector:
{{ include "apidoc.select-labels" . | indent 8 }}


---

apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ template "apidoc.fullname" . }}
spec:
selector:
matchLabels:
{{ include "apidoc.select-labels" . | indent 8 }}
replicas: {{ .Values.apidoc.replicas }}
template:
metadata:
labels:
{{ include "apidoc.labels" . | indent 8 }}
spec:
serviceAccountName: {{ template "apidoc.fullname" . }}
volumes:
- name: localtime
hostPath:
path: /etc/localtime
containers:
- name: {{ .Chart.Name }}
image: "{{ .Values.global.image.repository.address }}{{ .Values.global.image.repository.pathname }}/{{ .Values.apidoc.image.name }}:{{ default .Chart.AppVersion .Values.apidoc.image.tag }}"
imagePullPolicy: {{ .Values.global.image.pullPolicy }}
volumeMounts:
- name: localtime
mountPath: /etc/localtime
env:
- name: URLS
value: "[{url:\"{{ .Values.ingress.apidocPath }}/admin.swagger.json\",name:\"admin\"},{url:\"{{ .Values.ingress.apidocPath }}/openai.swagger.json\",name:\"openai\"}]"
ports:
- name: http
containerPort: {{ template "apidoc.targetPort" . }}
protocol: TCP
resources:
{{ toYaml .Values.resources | indent 10 }}
{{- with .Values.global.nodeSelector }}
nodeSelector:
{{ toYaml . | indent 8 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{ toYaml . | indent 8 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{ toYaml . | indent 8 }}
{{- end }}

+ 5
- 0
deploy/charts/octopus/templates/base-server.yaml View File

@@ -131,6 +131,11 @@ data:
addr: {{ include "redis.serviceAddr" . }}
username: {{ .Values.baseserver.data.redis.username }}
password: {{ .Values.baseserver.data.redis.password }}
influxdb:
addr: {{ include "influxdb.serviceAddr" . }}
username: {{ .Values.influxdb.setDefaultUser.user.username }}
password: {{ .Values.influxdb.setDefaultUser.user.password }}
database: octopus
service:
nfsRootPath: /octopus-storage
baseServerAddr: {{ include "baseserver.httpServiceAddr" . }}


+ 86
- 0
deploy/charts/octopus/templates/eventrouter.yaml View File

@@ -0,0 +1,86 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: eventrouter
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
name: eventrouter
rules:
- apiGroups: [""]
resources: ["events"]
verbs: ["get", "watch", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
name: eventrouter
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: eventrouter
subjects:
- kind: ServiceAccount
name: eventrouter
namespace: {{ .Release.Namespace }}
---
apiVersion: v1
data:
config.json: |-
{
"sink": "influxdb",
"influxdbHost": "{{ include "influxdb.serviceAddr" . }}",
"influxdbUsername": "{{ .Values.influxdb.setDefaultUser.user.username }}",
"influxdbPassword":"{{ .Values.influxdb.setDefaultUser.user.password }}",
"influxdbName": "octopus",
"influxdbWithFields": true
}
kind: ConfigMap
metadata:
name: eventrouter-cm
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ template "eventrouter.fullname" . }}
labels:
app: eventrouter
spec:
replicas: 1
selector:
matchLabels:
app: eventrouter
template:
metadata:
labels:
app: eventrouter
tier: control-plane-addons
spec:
containers:
- name: kube-eventrouter
image: swr.cn-south-1.myhuaweicloud.com/openioctopus/heptio-images/eventrouter:v0.3
imagePullPolicy: IfNotPresent
volumeMounts:
- name: config-volume
mountPath: /etc/eventrouter
serviceAccount: eventrouter
volumes:
- name: config-volume
configMap:
name: eventrouter-cm
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["amd64", "x64", "x86-64", "x86_64"]
- weight: 100
preference:
matchExpressions:
- key: beta.kubernetes.io/arch
operator: In
values: ["amd64", "x64", "x86-64", "x86_64"]

+ 24
- 0
deploy/charts/octopus/templates/ingress.yaml View File

@@ -4,6 +4,7 @@
{{- $openaiserverName := include "openaiserver.fullname" . -}}
{{- $adminportalName := include "adminportal.fullname" . -}}
{{- $openaiportalName := include "openaiportal.fullname" . -}}
{{- $apidocName := include "apidoc.fullname" . -}}
{{- $adminserverPath := .Values.ingress.adminserverPath -}}
{{- $openaiserverPath := .Values.ingress.openaiserverPath -}}
{{- $adminportalPath := trimSuffix "/" .Values.ingress.adminportalPath -}}
@@ -12,6 +13,7 @@
{{- $minioWebPath := .Values.ingress.minioPath.web -}}
{{- $minioApiPath := .Values.ingress.minioPath.api -}}
{{- $loggerHttpdPath := .Values.ingress.loggerHttpdPath -}}
{{- $apidocPath := .Values.ingress.apidocPath -}}
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
@@ -136,6 +138,28 @@ spec:

---

apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: {{ $fullName }}-apidoc
labels:
{{ include "octopus.labels" . | indent 4 }}
annotations:
kubernetes.io/ingress.class: "nginx"
kubernetes.io/ingress.allow-http: "true"
ingress.kubernetes.io/ssl-redirect: "false"
nginx.ingress.kubernetes.io/rewrite-target: /$1
spec:
rules:
- http:
paths:
- backend:
serviceName: {{ $apidocName }}
servicePort: {{ template "apidoc.port" . }}
path: {{ $apidocPath }}/(.*)

---

apiVersion: v1
kind: ConfigMap
metadata:


+ 10
- 1
deploy/charts/octopus/templates/storage/initdb.yaml View File

@@ -5,4 +5,13 @@ metadata:
data:
initdb.sql: |
CREATE DATABASE IF NOT EXISTS octopus DEFAULT CHARSET utf8 COLLATE utf8_general_ci;
CREATE DATABASE IF NOT EXISTS core DEFAULT CHARSET utf8 COLLATE utf8_general_ci;
CREATE DATABASE IF NOT EXISTS core DEFAULT CHARSET utf8 COLLATE utf8_general_ci;
---
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "octopus.name" . }}-influxdb
data:
custom-init-scripts.sh: |
influx -execute "CREATE DATABASE octopus" -username octopus -password octopus
influx -execute "CREATE RETENTION POLICY \"default\" ON octopus DURATION 0s REPLICATION 1 DEFAULT" -username {{ .Values.influxdb.setDefaultUser.user.username }} -password {{ .Values.influxdb.setDefaultUser.user.password }}

+ 16
- 1
deploy/charts/octopus/templates/storage/pv.yaml View File

@@ -54,4 +54,19 @@ spec:
storage: {{ .Values.pv.logger.requests }}
accessModes:
- ReadWriteMany
{{ toYaml .Values.pv.logger.storageType | indent 2 }}
{{ toYaml .Values.pv.logger.storageType | indent 2 }}

---

apiVersion: v1
kind: PersistentVolume
metadata:
name: octopus-influxdb-pv
labels:
pv: influxdb-pv
spec:
capacity:
storage: {{ .Values.pv.influxdb.requests }}
accessModes:
- ReadWriteMany
{{ toYaml .Values.pv.influxdb.storageType | indent 2 }}

+ 15
- 0
deploy/charts/octopus/templates/storage/pvc.yaml View File

@@ -59,6 +59,21 @@ spec:
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: octopus-influxdb-pvc
spec:
accessModes: ["ReadWriteMany"]
resources:
requests:
storage: {{ .Values.pvc.influxdb.requests }}
selector:
matchLabels:
pv: influxdb-pv

---

apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: octopus-logger-pvc
spec:
accessModes: ["ReadWriteMany"]


+ 48
- 1
deploy/charts/octopus/values.yaml View File

@@ -29,8 +29,10 @@ ingress:
minioPath:
web: /minio
api: /oss
apidocPath: /apidoc
tls: []


pvc:
minio:
requests: 100Gi
@@ -38,6 +40,8 @@ pvc:
requests: 100Gi
redis:
requests: 100Gi
influxdb:
requests: 100Gi
logger:
requests: 100Gi

@@ -60,6 +64,12 @@ pv:
nfs:
server: 192.168.203.72
path: /data/datasets/data/redis
influxdb:
requests: 100Gi
storageType:
nfs:
server: 192.168.203.72
path: /data/datasets/data/influxdb
logger:
requests: 100Gi
storageType:
@@ -317,6 +327,35 @@ nginx-ingress-controller:
operator: In
values: ["amd64", "x64", "x86-64", "x86_64"]

# influxdb
influxdb:
replicaCount: 1
initScripts:
enabled: true
setDefaultUser:
user:
username: "octopus"
password: "octopus"
persistence:
enabled: true
existingClaim: "octopus-influxdb-pvc"
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
preference:
matchExpressions:
- key: kubernetes.io/arch
operator: In
values: ["amd64", "x64", "x86-64", "x86_64"]
- weight: 100
preference:
matchExpressions:
- key: beta.kubernetes.io/arch
operator: In
values: ["amd64", "x64", "x86-64", "x86_64"]

# grafana
grafana:
nameOverride: "grafana"
@@ -361,4 +400,12 @@ prometheus:
targetPort: "9090"
resources: {}
affinity: {}
nodeSelector: {}
nodeSelector: {}

# apidoc
apidoc:
image:
pullPolicy: ""
address: ""
pathname: ""
name: "api-doc"

+ 8
- 0
openai-portal/src/api/trainingManager.js View File

@@ -57,6 +57,14 @@ export function getTempalteDetail(params) {
method: 'get'
})
}
// 训任务运行信息
export function getTempalteInfo(params) {
return request({
url: `/v1/trainmanage/trainjobevent`,
method: 'get',
params
})
}
// 任务模板接口
export function getTemplate(params) {
const conditions = []


+ 139
- 33
openai-portal/src/views/trainingManager/components/detailDialog/taskInfo.vue View File

@@ -1,10 +1,65 @@
<template>
<div>
<div v-html="initInfo"></div>
<el-row>
<el-col :span="12">
<div>任务名称:<span>{{ data.name }}</span></div>
</el-col>
<el-col :span="12">
<div>是否分布式:<span>{{ data.isDistributed?'是':'否' }}</span></div>
</el-col>
</el-row>
<el-row>
<el-col :span="12">
<el-form ref="ruleForm" :model="ruleForm">
<el-form-item prop="subTaskItem">
<div style="font-size: 15px">子任务名:
<el-select
v-model="ruleForm.subTaskItem"
value-key="label"
placeholder="请选择"
@change="selectedSubTaskOption"
>
<el-option
v-for="item in subTaskOptions"
:key="item.label"
:label="item.label"
:value="item"
/>
</el-select>
</div>
</el-form-item>
</el-form>
</el-col>
</el-row>

<div>
<el-input
v-if="showInfo"
v-model="subTaskInfo"
type="textarea"
:readonly="true"
:autosize="true"
/>
</div>

<div class="block">
<el-pagination
v-if="showInfo"
:current-page="pageIndex"
:page-sizes="[10, 20, 50, 80]"
:page-size="pageSize"
layout="total, sizes, prev, pager, next, jumper"
:total="total"
@size-change="handleSizeChange"
@current-change="handleCurrentChange"
/>
</div>
</div>
</template>

<script>
import { getTempalteInfo } from '@/api/trainingManager'
import { getErrorMsg } from '@/error/index'
export default {
name: "TaskInfo",
props: {
@@ -14,50 +69,101 @@
}
},
data() {
return {
initInfo: ""
}
return {
data: {},
initInfo: "",
subTaskOptions: [],
ruleForm: {
subTaskItem: ""
},
subTaskInfo: "",
pageIndex: 1,
pageSize: 10,
total: 0,
showInfo: false
}
},
created() {
const taskInfoString = this.row.initInfo ? this.row.initInfo.replace(/\n/g, "<br>") : ''
const taskInfoData = JSON.parse(taskInfoString)
for (const pid in taskInfoData['podEvents']) {
const eventList = taskInfoData['podEvents'][pid]
const roleName = taskInfoData['podRoleName'][pid]
if (roleName == "") {
continue
this.data = JSON.parse(JSON.stringify(this.row))
for (let i = 0; i < this.row.config.length; i++) {
for (let j = 0; j < this.row.config[i].taskNumber; j++) {
this.subTaskOptions.push({
label: this.row.config[i].replicaStates[j].key,
taskIndex: i + 1,
replicaIndex: j + 1
})
}
let message = ""
for (const key in eventList) {
const event = eventList[key]
if (event['reason'] == "" && event['message'] == "") {
continue
}
message += "[" + event['reason'] + "]" + "<br>"
message += event['message'] + "<br><br>"
}
},
methods: {
// 错误码
getErrorMsg(code) {
return getErrorMsg(code)
},
selectedSubTaskOption() {
const param = {
id: this.row.id,
pageIndex: this.pageIndex,
pageSize: this.pageSize,
taskIndex: this.ruleForm.subTaskItem.taskIndex,
replicaIndex: this.ruleForm.subTaskItem.replicaIndex
}
for (const key in taskInfoData['extras']) {
const event = taskInfoData['extras'][key]
if (event['reason'] == "" && event['message'] == "") {
continue
getTempalteInfo(param).then(response => {
if (response.success) {
this.showInfo = response.payload.jobEvents.length
this.total = response.payload.totalSize
let infoMessage = ""
response.payload.jobEvents.forEach(function(element) {
const title = element.reason
const message = element.message
infoMessage += "\n" + "[" + title + "]"
infoMessage += "\n" + "[" + message + "]" + "\n"
})
this.subTaskInfo = infoMessage
} else {
this.$message({
message: this.getErrorMsg(response.error.subcode),
type: 'warning'
});
}
message += "[" + event['reason'] + "]" + "<br>"
message += event['message'] + "<br><br>"
}
message += "<br>"
tempTaskInfoData[roleName] = message
}).catch(err => {
console.log("err:", err)
this.$message({
message: "未知错误",
type: 'warning'
});
});
},
handleSizeChange(val) {
this.pageSize = val
this.selectedSubTaskOption()
},
handleCurrentChange(val) {
this.pageIndex = val
this.selectedSubTaskOption()
}
let obj = {}
Object.keys(tempTaskInfoData).sort().forEach(function(key) {
obj[key] = tempTaskInfoData[key];
});
this.initInfo = obj
}
}
</script>

<style lang="scss" scoped>
.el-col {
margin: 10px 0 20px 0;
font-size: 15px;
font-weight: 800;

span {
font-weight: 400;
margin-left: 20px
}
}

.select {
margin-left: 5px;
}

.block {
float: right;
margin: 20px;
}
</style>

+ 1
- 1
server/admin-server/.gitignore View File

@@ -38,4 +38,4 @@ bin/
# pb
*.pb.go
*.pb.validate.go
*.swagger.json
*swagger.json

+ 0
- 2
server/admin-server/api/v1/trainJob.proto View File

@@ -131,8 +131,6 @@ message TrainJob {
string imageVersion = 23;
//启动时间
int64 startedAt = 24;
//启动信息
string initInfo = 25;
}

message TrainJobListRequest {


+ 1
- 1
server/admin-server/configs/config.yaml View File

@@ -12,7 +12,7 @@ data:
baseServerAddr: dns:///127.0.0.1:9001
baseServerRequestTimeout: 30s
redis:
addr: 192.168.202.73:31570
addr: 192.168.202.73:30331
username:
password: abcde
service:

+ 34
- 2
server/base-server/api/v1/develop.proto View File

@@ -21,6 +21,8 @@ service Develop {
rpc ListNotebook (ListNotebookRequest) returns (ListNotebookReply);
// 查询notebook详情
rpc GetNotebook (GetNotebookRequest) returns (GetNotebookReply);
//获取任务事件列表
rpc GetNotebookEventList (NotebookEventListRequest) returns (NotebookEventListReply);
}

message CreateNotebookRequest {
@@ -98,7 +100,6 @@ message Notebook {
string datasetVersion = 18;
string datasetName = 19;
uint32 resourceSpecPrice = 20;
string initInfo = 21;
}

message ListNotebookReply {
@@ -112,4 +113,35 @@ message GetNotebookRequest {

message GetNotebookReply {
Notebook notebook = 1;
}
}

message NotebookEventListRequest {
// 页码,从1开始
int64 pageIndex = 1[(validate.rules).int64 = {gte:1}];
// 页大小,最小1条,最大100条
int64 pageSize = 2[(validate.rules).int64 = {gte:1,lt:100}];
//任务ID
string id = 3[(validate.rules).string = {min_len: 1}];
//子任务序号,从1开始
int64 taskIndex = 4[(validate.rules).int64 = {gte:1,lt:100}];
//副本序号,从1开始
int64 replicaIndex = 5[(validate.rules).int64 = {gte:1,lt:100}];
}

message NotebookEventListReply {
//查询结果总数
int64 totalSize = 1;
//任务事件
repeated NotebookEvent notebookEvents = 2;
}

message NotebookEvent{
//发生时间
string timestamp = 1;
//副本名称
string name = 2;
//原因
string reason = 3;
//消息
string message = 4;
}

+ 33
- 2
server/base-server/api/v1/trainJob.proto View File

@@ -29,7 +29,8 @@ service TrainJobService {
rpc DeleteJobTemplate (DeleteJobTemplateRequest) returns (DeleteJobTemplateReply);
//获取任务模板列表
rpc ListJobTemplate (TrainJobTemplateListRequest) returns (TrainJobTemplateListReply);

//获取任务事件列表
rpc GetJobEventList (JobEventListRequest) returns (JobEventListReply);
}


@@ -216,7 +217,6 @@ message TrainJob{
string imageName = 20;
string dataSetName = 21;
int64 startedAt = 22;
string initInfo = 23;
}


@@ -227,3 +227,34 @@ message TrainJobInfoRequest {
message TrainJobInfoReply{
TrainJob trainJob = 1;
}

message JobEventListRequest {
// 页码,从1开始
int64 pageIndex = 1[(validate.rules).int64 = {gte:1}];
// 页大小,最小1条,最大100条
int64 pageSize = 2[(validate.rules).int64 = {gte:1,lt:100}];
//任务ID
string id = 3[(validate.rules).string = {min_len: 1}];
//子任务序号,从1开始
int64 taskIndex = 4[(validate.rules).int64 = {gte:1,lt:100}];
//副本序号,从1开始
int64 replicaIndex = 5[(validate.rules).int64 = {gte:1,lt:100}];
}

message JobEventListReply {
//查询结果总数
int64 totalSize = 1;
//任务事件
repeated JobEvent jobEvents = 2;
}

message JobEvent{
//发生时间
string timestamp = 1;
//副本名称
string name = 2;
//原因
string reason = 3;
//消息
string message = 4;
}

+ 6
- 1
server/base-server/configs/config.yaml View File

@@ -16,7 +16,7 @@ data:
source: root:root@tcp(192.168.202.73:30336)/octopus?charset=utf8&parseTime=True&loc=Local
kubernetes:
masterUrl: https://192.168.202.73:6443/
configPath: /home/hackmong/.kube/config
configPath: /home/hackmong/openi/dev/73config
pipeline:
baseUrl: http://127.0.0.1:8080
token: KLtmMug9BDvvRjlg
@@ -42,6 +42,11 @@ data:
addr: 192.168.202.73:31663
username:
password: abcde
influxdb:
addr: 192.168.202.73:8086
username: octopus
password: octopus
database: octopus
service:
baseServerAddr: http://127.0.0.1:8001
dockerDatasetPath: /dataset


+ 9
- 0
server/base-server/internal/conf/conf.proto View File

@@ -79,6 +79,14 @@ message Redis {
string password = 3;
}

message Influxdb {
string addr = 1;
string username = 2;
string password = 3;
string Database = 4;
string Precision = 5;
}

message Data {
Database database = 1;
Pipeline pipeline = 2;
@@ -86,6 +94,7 @@ message Data {
Minio minio = 4;
Harbor harbor = 5;
Redis redis = 6;
Influxdb influxdb = 7;
}

message Develop {


+ 60
- 3
server/base-server/internal/data/dao/develop.go View File

@@ -2,9 +2,11 @@ package dao

import (
"context"
"encoding/json"
stderrors "errors"
"fmt"
"server/base-server/internal/data/dao/model"
"server/base-server/internal/data/influxdb"
"server/common/errors"
"server/common/transaction"
"server/common/utils"
@@ -29,19 +31,23 @@ type DevelopDao interface {
UpdateNotebookJobSelective(ctx context.Context, notebookJob *model.NotebookJob) error
DeleteNotebookJobByNbId(ctx context.Context, notebookId string) error
ListNotebookJob(ctx context.Context, query *model.NotebookJobQuery) ([]*model.NotebookJob, error)
//获取Notebook事件
GetNotebookEvents(notebookEventQuery *model.NotebookEventQuery) ([]*model.NotebookEvent, int64, error)
}

type developDao struct {
log *log.Helper
db transaction.GetDB
log *log.Helper
db transaction.GetDB
influxdb influxdb.Influxdb
}

func NewDevelopDao(db *gorm.DB, logger log.Logger) DevelopDao {
func NewDevelopDao(db *gorm.DB, influxdb influxdb.Influxdb, logger log.Logger) DevelopDao {
return &developDao{
log: log.NewHelper("DevelopDao", logger),
db: func(ctx context.Context) *gorm.DB {
return transaction.GetDBFromCtx(ctx, db)
},
influxdb: influxdb,
}
}

@@ -298,3 +304,54 @@ func (d *developDao) ListNotebookJob(ctx context.Context, query *model.NotebookJ

return notebookJobs, nil
}

func (d *developDao) GetNotebookEvents(notebookEventQuery *model.NotebookEventQuery) ([]*model.NotebookEvent, int64, error) {

keyName := "object_name"
keyReason := "reason"
keyMessage := "message"

PageIndex := notebookEventQuery.PageIndex
PageSize := notebookEventQuery.PageSize
TaskIndex := notebookEventQuery.TaskIndex
ReplicaIndex := notebookEventQuery.ReplicaIndex
events := make([]*model.NotebookEvent, 0)

objectName := fmt.Sprintf("%s-task%d-%d", notebookEventQuery.Id, TaskIndex-1, ReplicaIndex-1)

countQuery := fmt.Sprintf("SELECT COUNT(%s) FROM octopus..events where object_name = '%s'", keyMessage, objectName)
res, err := d.influxdb.Query(countQuery)

if err != nil {
return events, 0, errors.Errorf(err, errors.ErroInfluxdbFindFailed)
}

if len(res) == 0 || len(res[0].Series) == 0 || len(res[0].Series[0].Values) == 0 || len(res[0].Series[0].Values[0]) < 2 {
return events, 0, errors.Errorf(err, errors.ErroInfluxdbFindFailed)
}

totalSize, err := res[0].Series[0].Values[0][1].(json.Number).Int64()
if err != nil {
return events, 0, errors.Errorf(err, errors.ErroInfluxdbFindFailed)
}

query := fmt.Sprintf("select %s, %s, %s from octopus..events where object_name = '%s' and kind = 'Pod' LIMIT %d OFFSET %d",
keyName, keyReason, keyMessage, objectName, PageSize, (PageIndex-1)*PageSize)
res, err = d.influxdb.Query(query)

if err != nil {
return events, 0, errors.Errorf(err, errors.ErroInfluxdbFindFailed)
}

for _, row := range res[0].Series[0].Values {

event := &model.NotebookEvent{}
event.Timestamp = row[0].(string)
event.Name = row[1].(string)
event.Reason = row[2].(string)
event.Message = row[3].(string)
events = append(events, event)
}

return events, totalSize, nil
}

+ 15
- 0
server/base-server/internal/data/dao/model/develop.go View File

@@ -80,3 +80,18 @@ type NotebookJobQuery struct {
PayStatus api.BillingPayRecordStatus
Ids []string
}

type NotebookEvent struct {
Timestamp string
Name string
Reason string
Message string
}

type NotebookEventQuery struct {
PageIndex int
PageSize int
Id string
TaskIndex int
ReplicaIndex int
}

+ 15
- 0
server/base-server/internal/data/dao/model/train_job.go View File

@@ -152,3 +152,18 @@ type TrainJobTemPlateListQuery struct {
WorkspaceId string
Ids []string
}

type TrainJobEvent struct {
Timestamp string
Name string
Reason string
Message string
}

type JobEventQuery struct {
PageIndex int
PageSize int
Id string
TaskIndex int
ReplicaIndex int
}

+ 62
- 5
server/base-server/internal/data/dao/train_job.go View File

@@ -2,9 +2,11 @@ package dao

import (
"context"
"encoding/json"
stderrors "errors"
"fmt"
"server/base-server/internal/data/dao/model"
"server/base-server/internal/data/influxdb"
"server/common/errors"
"server/common/utils"
"time"
@@ -42,17 +44,21 @@ type TrainJobDao interface {
UpdateTrainJobTemplate(ctx context.Context, trainJobTemplate *model.TrainJobTemplate) error
//网关层删除任务模板(软删除)
DeleteTrainJobTemplate(userId string, ids []string) error
//获取训练任务事件
GetTrainJobEvents(jobEventQuery *model.JobEventQuery) ([]*model.TrainJobEvent, int64, error)
}

type trainJobDao struct {
log *log.Helper
db *gorm.DB
log *log.Helper
db *gorm.DB
influxdb influxdb.Influxdb
}

func NewTrainJobDao(db *gorm.DB, logger log.Logger) TrainJobDao {
func NewTrainJobDao(db *gorm.DB, influxdb influxdb.Influxdb, logger log.Logger) TrainJobDao {
return &trainJobDao{
log: log.NewHelper("TrainJobDao", logger),
db: db,
log: log.NewHelper("TrainJobDao", logger),
db: db,
influxdb: influxdb,
}
}

@@ -347,3 +353,54 @@ func (d *trainJobDao) DeleteTrainJobTemplate(userId string, ids []string) error
}
return nil
}

func (d *trainJobDao) GetTrainJobEvents(jobEventQuery *model.JobEventQuery) ([]*model.TrainJobEvent, int64, error) {

keyName := "object_name"
keyReason := "reason"
keyMessage := "message"

PageIndex := jobEventQuery.PageIndex
PageSize := jobEventQuery.PageSize
TaskIndex := jobEventQuery.TaskIndex
ReplicaIndex := jobEventQuery.ReplicaIndex
events := make([]*model.TrainJobEvent, 0)

objectName := fmt.Sprintf("%s-task%d-%d", jobEventQuery.Id, TaskIndex-1, ReplicaIndex-1)

countQuery := fmt.Sprintf("SELECT COUNT(%s) FROM octopus..events where object_name = '%s'", keyMessage, objectName)
res, err := d.influxdb.Query(countQuery)

if err != nil {
return events, 0, errors.Errorf(err, errors.ErroInfluxdbFindFailed)
}

if len(res) == 0 || len(res[0].Series) == 0 || len(res[0].Series[0].Values) == 0 || len(res[0].Series[0].Values[0]) < 2 {
return events, 0, errors.Errorf(err, errors.ErroInfluxdbFindFailed)
}

totalSize, err := res[0].Series[0].Values[0][1].(json.Number).Int64()
if err != nil {
return events, 0, errors.Errorf(err, errors.ErroInfluxdbFindFailed)
}

query := fmt.Sprintf("select %s, %s, %s from octopus..events where object_name = '%s' and kind = 'Pod' LIMIT %d OFFSET %d",
keyName, keyReason, keyMessage, objectName, PageSize, (PageIndex-1)*PageSize)
res, err = d.influxdb.Query(query)

if err != nil {
return events, 0, errors.Errorf(err, errors.ErroInfluxdbFindFailed)
}

for _, row := range res[0].Series[0].Values {

event := &model.TrainJobEvent{}
event.Timestamp = row[0].(string)
event.Name = row[1].(string)
event.Reason = row[2].(string)
event.Message = row[3].(string)
events = append(events, event)
}

return events, totalSize, nil
}

+ 9
- 2
server/base-server/internal/data/data.go View File

@@ -7,6 +7,7 @@ import (
"server/base-server/internal/data/dao/algorithm_dao"
"server/base-server/internal/data/dao/model"
"server/base-server/internal/data/dao/model/resources"
"server/base-server/internal/data/influxdb"
"server/base-server/internal/data/minio"
"server/base-server/internal/data/pipeline"
"server/base-server/internal/data/redis"
@@ -36,6 +37,7 @@ type Data struct {
Minio minio.Minio
Registry registry.ArtifactRegistry
Redis redis.Redis
Influxdb influxdb.Influxdb
}

func NewData(confData *conf.Data, logger log.Logger) (*Data, func(), error) {
@@ -46,17 +48,22 @@ func NewData(confData *conf.Data, logger log.Logger) (*Data, func(), error) {
return nil, nil, err
}

influxdb, err := influxdb.NewInfluxdb(confData)
if err != nil {
return nil, nil, err
}

d.UserDao = dao.NewUserDao(db, logger)
d.AdminUserDao = dao.NewAdminUserDao(db, logger)
d.AlgorithmDao = algorithm_dao.NewAlgorithmDao(db, logger)
d.ResourceDao = dao.NewResourceDao(db, logger)
d.ResourceSpecDao = dao.NewResourceSpecDao(db, logger)
d.DevelopDao = dao.NewDevelopDao(db, logger)
d.DevelopDao = dao.NewDevelopDao(db, influxdb, logger)
d.ModelDao = dao.NewModelDao(db, logger)
d.DatasetDao = dao.NewDatasetDao(db, logger)
d.WorkspaceDao = dao.NewWorkspaceDao(db, logger)
d.ImageDao = dao.NewImageDao(db, logger)
d.TrainJobDao = dao.NewTrainJobDao(db, logger)
d.TrainJobDao = dao.NewTrainJobDao(db, influxdb, logger)
d.BillingDao = dao.NewBillingDao(db, logger)
d.Pipeline = pipeline.NewPipeline(confData, logger)
d.Cluster = cluster.NewCluster(confData, logger)


+ 101
- 0
server/base-server/internal/data/influxdb/influxdb.go View File

@@ -0,0 +1,101 @@
package influxdb

import (
"fmt"
"server/base-server/internal/conf"
"server/common/errors"
"time"

"net/url"

influxdbClient "github.com/influxdata/influxdb/client"
)

type Influxdb interface {
//查询
Query(cmd string) (res []influxdbClient.Result, err error)
//写入
Write(measurement string, tags map[string]string, fields map[string]interface{}) (err error)
}

type influxbd struct {
conf *conf.Data
client *influxdbClient.Client
}

func NewInfluxdb(conf *conf.Data) (db Influxdb, err error) {

url := &url.URL{
Scheme: "http",
Host: conf.Influxdb.Addr,
}

iConfig := &influxdbClient.Config{
URL: *url,
Username: conf.Influxdb.Username,
Password: conf.Influxdb.Password,
}

client, err := influxdbClient.NewClient(*iConfig)
if err != nil {
err = errors.Errorf(err, errors.ErroInfluxdbInitFailed)
return nil, err
}

if _, _, err := client.Ping(); err != nil {
err = fmt.Errorf("failed to ping influxDB server at %q - %v", conf.Influxdb.Addr, err)
return nil, errors.Errorf(err, errors.ErroInfluxdbInitFailed)
}

if err != nil {
return nil, errors.Errorf(err, errors.ErroInfluxdbInitFailed)
}

influxdb := &influxbd{
conf: conf,
client: client,
}

return influxdb, nil
}

func (i *influxbd) Query(cmd string) (res []influxdbClient.Result, err error) {

q := influxdbClient.Query{
Command: cmd,
Database: i.conf.Influxdb.Database,
}
if response, err := i.client.Query(q); err == nil {
if response.Error() != nil {
return res, response.Error()
}
res = response.Results
} else {
return res, err
}
return res, nil
}

func (i *influxbd) Write(measurement string, tags map[string]string, fields map[string]interface{}) (err error) {

point := influxdbClient.Point{
Measurement: measurement,
Time: time.Now().UTC(),
Fields: fields,
Tags: tags,
}

dataPoints := make([]influxdbClient.Point, 0, 10)
dataPoints = append(dataPoints, point)

batchPoints := influxdbClient.BatchPoints{
Points: dataPoints,
Database: i.conf.Influxdb.Database,
RetentionPolicy: "default",
}

if _, err := i.client.Write(batchPoints); err != nil {
return err
}
return nil
}

+ 27
- 4
server/base-server/internal/service/develop/develop.go View File

@@ -796,12 +796,35 @@ func (s *developService) GetNotebook(ctx context.Context, req *api.GetNotebookRe
notebook.CreatedAt = notebookTbl.CreatedAt.Unix()
notebook.UpdatedAt = notebookTbl.UpdatedAt.Unix()

//pipeline获取job最新任务信息
info, err := s.data.Pipeline.GetJobDetail(ctx, notebookTbl.NotebookJobId)
return &api.GetNotebookReply{Notebook: notebook}, nil
}

func (s *developService) GetNotebookEventList(ctx context.Context, req *api.NotebookEventListRequest) (*api.NotebookEventListReply, error) {

query := &model.NotebookEventQuery{}
err := copier.Copy(query, req)
if err != nil {
return nil, err
}
notebook.InitInfo = info.Job.ExitDiagnostics

return &api.GetNotebookReply{Notebook: notebook}, nil
events, totalSize, err := s.data.DevelopDao.GetNotebookEvents(query)
if err != nil {
return nil, err
}

notebookEvents := make([]*api.NotebookEvent, 0)

for _, value := range events {
event := &api.NotebookEvent{}
event.Timestamp = value.Timestamp
event.Name = value.Name
event.Reason = value.Reason
event.Message = value.Message
notebookEvents = append(notebookEvents, event)
}

return &api.NotebookEventListReply{
TotalSize: totalSize,
NotebookEvents: notebookEvents,
}, nil
}

+ 30
- 1
server/base-server/internal/service/trainjob/train_job.go View File

@@ -701,7 +701,6 @@ func (s *trainJobService) GetTrainJobInfo(ctx context.Context, req *api.TrainJob
if err != nil {
return nil, err
}
trainJobDetail.InitInfo = info.Job.ExitDiagnostics
for index, config := range trainJobDetail.Config {
replyStates := make([]*api.ReplicaState, 0)
for ri := 0; ri < int(config.TaskNumber); ri++ {
@@ -992,3 +991,33 @@ func (s *trainJobService) PipelineCallback(ctx context.Context, req *common.Pipe

return common.PipeLineCallbackOK
}

func (s *trainJobService) GetJobEventList(ctx context.Context, req *api.JobEventListRequest) (*api.JobEventListReply, error) {

query := &model.JobEventQuery{}
err := copier.Copy(query, req)
if err != nil {
return nil, err
}

events, totalSize, err := s.data.TrainJobDao.GetTrainJobEvents(query)
if err != nil {
return nil, err
}

jobEvents := make([]*api.JobEvent, 0)

for _, value := range events {
event := &api.JobEvent{}
event.Timestamp = value.Timestamp
event.Name = value.Name
event.Reason = value.Reason
event.Message = value.Message
jobEvents = append(jobEvents, event)
}

return &api.JobEventListReply{
TotalSize: totalSize,
JobEvents: jobEvents,
}, nil
}

+ 4
- 0
server/common/errors/codes.go View File

@@ -60,6 +60,10 @@ const (
ErroRedisHGetFailed = 10102 // redisHGet失败
ErroRedisHDelFailed = 10103 // redisHDel失败
ErrorRedisLockObtainFailed = 10104 // redis锁获取失败
// influxdb操作相关错误
ErroInfluxdbInitFailed = 10200 // influxdb初始化失败
ErroInfluxdbFindFailed = 10201 // influxdb列表查询失败
ErroInfluxdbWriteFailed = 10202 // influxdb插入失败

/* 11001~12000 资源管理错误*/
ErrorDeleteResourcePool = 11001 // 删除资源池失败


+ 74
- 31
server/common/utils/protoc.go View File

@@ -9,6 +9,8 @@ import (
"os/exec"
"path/filepath"
"strings"

jsonpatch "github.com/evanphx/json-patch"
)

// copy from source: kratos/cmd/kratos/internal/base/mod.go
@@ -123,52 +125,93 @@ func Generate() error {
return err
}

err = GenSwagger()
if err != nil {
return err
}
return nil
}

func GenSwagger() error {
dir, err := os.Getwd()
baseDir, err := os.Getwd()
if err != nil {
return err
}
err = filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
if ext := filepath.Ext(path); ext != ".proto" {
return nil
}
if strings.Contains(path, "vendor") {

dirs := []string{filepath.Join(baseDir, "admin-server", "api", "v1"), filepath.Join(baseDir, "openai-server", "api", "v1")}
for _, dir := range dirs {
err = filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
if ext := filepath.Ext(path); ext != ".proto" {
return nil
}
if strings.Contains(path, "vendor") {
return nil
}

execDir := filepath.Dir(baseDir)
name := strings.ReplaceAll(path, execDir+string(filepath.Separator), "")
input := []string{
"--proto_path=.",
"--proto_path=" + filepath.Join(os.Getenv("GOPATH"), "src"),
"--proto_path=" + filepath.Join(KratosMod(), "api"),
"--proto_path=" + filepath.Join(KratosMod(), "third_party"),
"--openapiv2_out",
"./",
"--openapiv2_opt",
"logtostderr=true",
name,
}

fd := exec.Command("protoc", input...)
fd.Stdout = os.Stdout
fd.Stderr = os.Stderr
fd.Dir = execDir
if err := fd.Run(); err != nil {
return err
}
fmt.Printf("proto: %s\n", name)
return nil
})
if err != nil {
return err
}
if !strings.Contains(path, "admin-server") &&
!strings.Contains(path, "openai-server") {

swaggerFileName := "swagger.json"
swaggerBytes := []byte(`{}`)
err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
if !strings.Contains(path, ".swagger.json") {
return nil
}

fileBytes, err := ioutil.ReadFile(path)
if err != nil {
return err
}

swaggerBytes, err = jsonpatch.MergePatch(swaggerBytes, fileBytes)
if err != nil {
return err
}
return nil
})
if err != nil {
return err
}

execDir := filepath.Dir(dir)
name := strings.ReplaceAll(path, execDir+string(filepath.Separator), "")
input := []string{
"--proto_path=.",
"--proto_path=" + filepath.Join(os.Getenv("GOPATH"), "src"),
"--proto_path=" + filepath.Join(KratosMod(), "api"),
"--proto_path=" + filepath.Join(KratosMod(), "third_party"),
"--openapiv2_out",
"./",
"--openapiv2_opt",
"logtostderr=true",
name,
swaggerBytes, err = jsonpatch.MergePatch(swaggerBytes, []byte(`
{
"info": {
"title": "octopus api",
"version": ""
}
}`))
if err != nil {
return err
}

fd := exec.Command("protoc", input...)
fd.Stdout = os.Stdout
fd.Stderr = os.Stderr
fd.Dir = execDir
if err := fd.Run(); err != nil {
err = ioutil.WriteFile(filepath.Join(dir, swaggerFileName), swaggerBytes, 0755)
if err != nil {
return err
}
fmt.Printf("proto: %s\n", name)
return nil
})
if err != nil {
return err
}

return nil


+ 7
- 5
server/go.mod View File

@@ -7,29 +7,31 @@ require (
github.com/bsm/redislock v0.7.1
github.com/dgrijalva/jwt-go v3.2.0+incompatible
github.com/envoyproxy/protoc-gen-validate v0.1.0
github.com/evanphx/json-patch v4.9.0+incompatible
github.com/fsouza/go-dockerclient v1.7.2
github.com/go-kratos/kratos/v2 v2.0.0-beta3
github.com/go-redis/redis/v8 v8.10.0
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e // indirect
github.com/golang/protobuf v1.4.3
github.com/golang/protobuf v1.5.2
github.com/google/uuid v1.1.2
github.com/gorilla/mux v1.8.0
github.com/imdario/mergo v0.3.11 // indirect
github.com/influxdata/influxdb v1.9.4
github.com/influxdata/influxdb1-client v0.0.0-20200827194710-b269163b24ab // indirect
github.com/jinzhu/copier v0.2.5
github.com/json-iterator/go v1.1.10
github.com/kr/text v0.2.0 // indirect
github.com/minio/minio-go/v7 v7.0.11
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e // indirect
github.com/sony/sonyflake v1.0.0
golang.org/x/crypto v0.0.0-20200709230013-948cd5f35899
golang.org/x/crypto v0.0.0-20201221181555-eec23a3978ad
golang.org/x/net v0.0.0-20210410081132-afb366fc7cd1 // indirect
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d
golang.org/x/text v0.3.6
gonum.org/v1/gonum v0.6.2
google.golang.org/appengine v1.6.5 // indirect
gonum.org/v1/gonum v0.8.2
google.golang.org/genproto v0.0.0-20210212180131-e7f2df4ecc2d
google.golang.org/grpc v1.36.0
google.golang.org/protobuf v1.25.0
google.golang.org/protobuf v1.26.0
gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f // indirect
gopkg.in/errgo.v2 v2.1.0
gopkg.in/resty.v1 v1.12.0


+ 658
- 0
server/go.sum
File diff suppressed because it is too large
View File


+ 1
- 1
server/openai-server/.gitignore View File

@@ -38,4 +38,4 @@ bin/
# pb
*.pb.go
*.pb.validate.go
*.swagger.json
*swagger.json

+ 37
- 1
server/openai-server/api/v1/develop.proto View File

@@ -47,6 +47,12 @@ service Develop {
get: "/v1/developmanage/notebook/{id}"
};
}
// 获取Notebook事件列表
rpc GetNotebookEventList (NotebookEventListRequest) returns (NotebookEventListReply) {
option (google.api.http) = {
get: "/v1/developmanage/notebookevent"
};
};
}

message CreateNotebookRequest {
@@ -127,10 +133,40 @@ message Notebook {
string datasetVersion = 18;
string datasetName = 19;
uint32 resourceSpecPrice = 20;
string initInfo = 21;
}

message ListNotebookReply {
int64 totalSize = 1;
repeated Notebook notebooks = 2;
}

message NotebookEventListRequest {
// 页码,从1开始
int64 pageIndex = 1[(validate.rules).int64 = {gte:1}];
// 页大小,最小1条,最大100条
int64 pageSize = 2[(validate.rules).int64 = {gte:1,lt:100}];
//任务ID
string id = 3[(validate.rules).string = {min_len: 1}];
//子任务序号,从1开始
int64 taskIndex = 4[(validate.rules).int64 = {gte:1,lt:100}];
//副本序号,从1开始
int64 replicaIndex = 5[(validate.rules).int64 = {gte:1,lt:100}];
}

message NotebookEventListReply {
//查询结果总数
int64 totalSize = 1;
//任务事件
repeated NotebookEvent notebookEvents = 2;
}

message NotebookEvent{
//发生时间
string timestamp = 1;
//副本名称
string name = 2;
//原因
string reason = 3;
//消息
string message = 4;
}

+ 37
- 2
server/openai-server/api/v1/trainJob.proto View File

@@ -74,6 +74,12 @@ service TrainJobService {
get: "/v1/trainmanage/trainjobtemplate"
};
};
// 获取训练任务事件列表
rpc GetJobEventList (JobEventListRequest) returns (JobEventListReply) {
option (google.api.http) = {
get: "/v1/trainmanage/trainjobevent"
};
};

}

@@ -306,8 +312,6 @@ message TrainJob{
string imageVersion = 21;
//启动时间
int64 startedAt = 22;
//启动信息
string initInfo = 23;
}

message TrainJobTemplate{
@@ -358,3 +362,34 @@ message TrainJobInfoReply{
//训练任务
TrainJob trainJob = 4;
}

message JobEventListRequest {
// 页码,从1开始
int64 pageIndex = 1[(validate.rules).int64 = {gte:1}];
// 页大小,最小1条,最大100条
int64 pageSize = 2[(validate.rules).int64 = {gte:1,lt:100}];
//任务ID
string id = 3[(validate.rules).string = {min_len: 1}];
//子任务序号,从1开始
int64 taskIndex = 4[(validate.rules).int64 = {gte:1,lt:100}];
//副本序号,从1开始
int64 replicaIndex = 5[(validate.rules).int64 = {gte:1,lt:100}];
}

message JobEventListReply {
//查询结果总数
int64 totalSize = 1;
//任务事件
repeated JobEvent jobEvents = 2;
}

message JobEvent{
//发生时间
string timestamp = 1;
//副本名称
string name = 2;
//原因
string reason = 3;
//消息
string message = 4;
}

+ 1
- 1
server/openai-server/configs/config.yaml View File

@@ -12,7 +12,7 @@ data:
baseServerAddr: dns:///127.0.0.1:9001
baseServerRequestTimeout: 30s
redis:
addr: 192.168.202.73:31570
addr: 192.168.202.73:30331
username:
password: abcde
service:


+ 26
- 0
server/openai-server/internal/service/develop.go View File

@@ -172,3 +172,29 @@ func (s *DevelopService) QueryNotebook(ctx context.Context, req *api.QueryNotebo

return reply, nil
}

// Notebook事件列表
func (s *DevelopService) GetNotebookEventList(ctx context.Context, req *api.NotebookEventListRequest) (*api.NotebookEventListReply, error) {
session := session.SessionFromContext(ctx)
if session == nil {
return nil, errors.Errorf(nil, errors.ErrorUserNoAuthSession)
}

innerReq := &innerapi.NotebookEventListRequest{}
err := copier.Copy(innerReq, req)
if err != nil {
return nil, errors.Errorf(err, errors.ErrorStructCopy)
}

innerReply, err := s.data.DevelopClient.GetNotebookEventList(ctx, innerReq)
if err != nil {
return nil, err
}

reply := &api.NotebookEventListReply{}
err = copier.Copy(reply, innerReply)
if err != nil {
return nil, err
}
return reply, nil
}

+ 26
- 0
server/openai-server/internal/service/trainjob.go View File

@@ -461,3 +461,29 @@ func (s *TrainJobService) assignValueToTemplate(ctx context.Context, templates [

return nil
}

// 任务事件列表
func (s *TrainJobService) GetJobEventList(ctx context.Context, req *api.JobEventListRequest) (*api.JobEventListReply, error) {
session := session.SessionFromContext(ctx)
if session == nil {
return nil, errors.Errorf(nil, errors.ErrorUserNoAuthSession)
}

innerReq := &innerapi.JobEventListRequest{}
err := copier.Copy(innerReq, req)
if err != nil {
return nil, errors.Errorf(err, errors.ErrorStructCopy)
}

innerReply, err := s.data.TrainJobClient.GetJobEventList(ctx, innerReq)
if err != nil {
return nil, err
}

reply := &api.JobEventListReply{}
err = copier.Copy(reply, innerReply)
if err != nil {
return nil, err
}
return reply, nil
}

+ 0
- 3
server/taskset/pkg/pipeline/services/kubernetes/event_handlers.go View File

@@ -72,8 +72,6 @@ func (s *Service) onTaskSetAdd(obj interface{}) {

ta := s.convert(obj)

InitPodInfoWatch(s, ta.Name, ta.Namespace)

if nil == ta {
return
}
@@ -189,5 +187,4 @@ func (s *Service) onTaskSetDelete(obj interface{}) {

s.emit(ta.Name, jobstate.STOPPED, ta)
s.app.Services().Job().StopJob(ta.Name, ta.Namespace, "exception stopped")
EndPodInfoWatch(ta.Name)
}

+ 0
- 113
server/taskset/pkg/pipeline/services/kubernetes/logs_helper.go View File

@@ -1,113 +0,0 @@
package kubernetes

import (
"time"
"strings"
"k8s.io/client-go/tools/cache"
"k8s.io/apimachinery/pkg/fields"
corev1 "k8s.io/api/core/v1"
"scheduler/pkg/pipeline/utils"
typeJob "volcano.sh/volcano/pkg/apis/batch/v1alpha1"
lrucache "scheduler/pkg/pipeline/utils/lrucache"
)

type JobEvent struct {
JobID string
Namespace string
PodUID map[string]string
PodEvents map[string][][]*typeJob.PodEvent
}

var EventUIDCache = lrucache.NewMemCache(32768)
var PodStopChan = make(map[string](chan struct{}))
var EventStopChan = make(map[string](chan struct{}))


func getTaskRoleName(pod corev1.Pod) string {
idx := strings.Index(pod.Name, "-")
taskRoleName := pod.Name[idx+1:len(pod.Name)]
return taskRoleName
}

func InitPodInfoWatch(s * Service, jobID, namespace string) {
k8sClient := s.GetKubeClient()
podWatchlist := cache.NewListWatchFromClient(k8sClient.CoreV1().RESTClient(), "pods", namespace, fields.Everything())
_, podController := cache.NewInformer(
podWatchlist,
&corev1.Pod{},
time.Second * 0,
cache.ResourceEventHandlerFuncs{
DeleteFunc: func(obj interface{}) {
var pod *corev1.Pod = obj.(*corev1.Pod)
if pod.Labels["volcano.sh/job-name"] == jobID {
podEvent := &typeJob.PodEvent{
UID: utils.GetRandomString(12),
Reason: pod.Status.Reason,
Message:pod.Status.Message,
}
taskRole := getTaskRoleName(*pod)
jobInfo := &typeJob.JobInfo{}
jobInfo.PodRoleName = make(map[string]string)
jobInfo.PodRoleName[taskRole] = taskRole
jobInfo.PodEvents = make(map[string][]*typeJob.PodEvent)
jobInfo.PodEvents[taskRole] = append(jobInfo.PodEvents[taskRole], podEvent)
s.app.Services().Job().UpdateJobSummary(jobID, jobInfo, nil, true)
}
},
},
)

eventWatchlist := cache.NewListWatchFromClient(k8sClient.CoreV1().RESTClient(), "events", namespace, fields.Everything())
_, eventController := cache.NewInformer(
eventWatchlist,
&corev1.Event{},
time.Second * 0,
cache.ResourceEventHandlerFuncs{
AddFunc: func(e interface{}) {
if event, ok := e.(*corev1.Event); ok {
name := event.InvolvedObject.Name
prefix := jobID + "-"
if strings.Index(name, prefix) == 0 {
uid := string(event.UID)