From ae307af19e8687eb955585400ffb7b56b3b70225 Mon Sep 17 00:00:00 2001 From: "sammy.huang" <sammy.huang@zilliz.com> Date: Wed, 3 Apr 2024 11:41:21 +0800 Subject: [PATCH] enhance: [skip e2e]enable gpu e2e pipeline (#31821) Signed-off-by: Liang Huang <sammy.huang@zilliz.com> --- ci/jenkins/PRGPU.groovy | 12 ++++++----- ci/jenkins/pod/rte-gpu.yaml | 31 ++++++++++++++--------------- tests/scripts/values/ci/pr-gpu.yaml | 23 +++++++-------------- 3 files changed, 29 insertions(+), 37 deletions(-) diff --git a/ci/jenkins/PRGPU.groovy b/ci/jenkins/PRGPU.groovy index c828c8db1d..1f532755e3 100644 --- a/ci/jenkins/PRGPU.groovy +++ b/ci/jenkins/PRGPU.groovy @@ -12,13 +12,13 @@ pipeline { // buildDiscarder logRotator(artifactDaysToKeepStr: '30') // parallelsAlwaysFailFast() // preserveStashes(buildCount: 5) - // disableConcurrentBuilds(abortPrevious: true) + disableConcurrentBuilds(abortPrevious: true) } agent { kubernetes { - inheritFrom 'default' - defaultContainer 'main' + cloud '4am' + inheritFrom 'milvus-e2e-4am' yamlFile 'ci/jenkins/pod/rte-gpu.yaml' customWorkspace '/home/jenkins/agent/workspace' } @@ -116,7 +116,7 @@ pipeline { withCredentials([usernamePassword(credentialsId: "${env.CI_DOCKER_CREDENTIAL_ID}", usernameVariable: 'CI_REGISTRY_USERNAME', passwordVariable: 'CI_REGISTRY_PASSWORD')]){ sh """ MILVUS_CLUSTER_ENABLED=${clusterEnabled} \ - MILVUS_HELM_REPO="http://nexus-nexus-repository-manager.nexus:8081/repository/milvus-proxy" \ + MILVUS_HELM_REPO="https://nexus-ci.zilliz.cc/repository/milvus-proxy" \ TAG=${imageTag}\ ./e2e-k8s.sh \ --skip-export-logs \ @@ -133,6 +133,7 @@ pipeline { --set indexNode.disk.enabled=true \ --set queryNode.disk.enabled=true \ --set standalone.disk.enabled=true \ + --set "tolerations[0].key=node-role.kubernetes.io/gpu,tolerations[0].operator=Exists,tolerations[0].effect=NoSchedule" \ --version ${chart_version} \ -f values/ci/pr-gpu.yaml" """ @@ -152,6 +153,7 @@ pipeline { } agent { kubernetes { + cloud '4am' inheritFrom 'default' defaultContainer 'main' yamlFile 'ci/jenkins/pod/e2e.yaml' @@ -177,7 +179,7 @@ pipeline { MILVUS_HELM_NAMESPACE="milvus-ci" \ MILVUS_CLUSTER_ENABLED="${clusterEnabled}" \ TEST_TIMEOUT="${e2e_timeout_seconds}" \ - ./ci_e2e.sh "--tags GPU -n 6 -x --timeout ${case_timeout_seconds}" + ./ci_e2e_4am.sh "--tags GPU -n 6 -x --timeout ${case_timeout_seconds}" """ } else { diff --git a/ci/jenkins/pod/rte-gpu.yaml b/ci/jenkins/pod/rte-gpu.yaml index 9fb7229126..275437ede1 100644 --- a/ci/jenkins/pod/rte-gpu.yaml +++ b/ci/jenkins/pod/rte-gpu.yaml @@ -48,18 +48,18 @@ spec: subPath: docker-volume-gpu - mountPath: /ci-logs name: ci-logs - # - name: pytest - # image: harbor.milvus.io/dockerhub/milvusdb/pytest:20230303-0cb8153 - # resources: - # limits: - # cpu: "6" - # memory: 12Gi - # requests: - # cpu: "0.5" - # memory: 5Gi - # volumeMounts: - # - mountPath: /ci-logs - # name: ci-logs + - name: pytest + image: harbor.milvus.io/dockerhub/milvusdb/pytest:20240313-652b866 + resources: + limits: + cpu: "6" + memory: 12Gi + requests: + cpu: "0.5" + memory: 5Gi + volumeMounts: + - mountPath: /ci-logs + name: ci-logs volumes: - emptyDir: {} name: docker-graph @@ -79,7 +79,6 @@ spec: name: cgroup - name: ci-logs nfs: - path: /ci-logs - server: 172.16.70.239 - nodeSelector: - nvidia.com/gpu.present: 'true' + path: /volume1/ci-logs + # path: /volume1/4am-logs + server: 172.16.70.249 diff --git a/tests/scripts/values/ci/pr-gpu.yaml b/tests/scripts/values/ci/pr-gpu.yaml index 2bbff8e0fe..b677b8f1e3 100644 --- a/tests/scripts/values/ci/pr-gpu.yaml +++ b/tests/scripts/values/ci/pr-gpu.yaml @@ -2,24 +2,18 @@ metrics: serviceMonitor: enabled: true proxy: - nodeSelector: - nvidia.com/gpu.present: 'true' resources: requests: cpu: "0.1" memory: "256Mi" rootCoordinator: - nodeSelector: - nvidia.com/gpu.present: 'true' resources: requests: cpu: "0.1" memory: "256Mi" queryCoordinator: - nodeSelector: - nvidia.com/gpu.present: 'true' resources: requests: cpu: "0.4" @@ -33,15 +27,13 @@ queryNode: value: "0,1" resources: requests: - nvidia.com/gpu: 2 + nvidia.com/gpu: 1 cpu: "0.5" memory: "500Mi" limits: - nvidia.com/gpu: 2 + nvidia.com/gpu: 1 indexCoordinator: - nodeSelector: - nvidia.com/gpu.present: 'true' resources: requests: cpu: "0.1" @@ -55,23 +47,19 @@ indexNode: value: "0,1" resources: requests: - nvidia.com/gpu: 2 + nvidia.com/gpu: 1 cpu: "0.5" memory: "500Mi" limits: - nvidia.com/gpu: 2 + nvidia.com/gpu: 1 dataCoordinator: - nodeSelector: - nvidia.com/gpu.present: 'true' resources: requests: cpu: "0.1" memory: "50Mi" dataNode: - nodeSelector: - nvidia.com/gpu.present: 'true' resources: requests: cpu: "0.5" @@ -192,6 +180,9 @@ minio: cpu: "0.3" memory: "512Mi" standalone: + persistence: + persistentVolumeClaim: + storageClass: "local-path" nodeSelector: nvidia.com/gpu.present: 'true' resources: