From ae307af19e8687eb955585400ffb7b56b3b70225 Mon Sep 17 00:00:00 2001
From: "sammy.huang" <sammy.huang@zilliz.com>
Date: Wed, 3 Apr 2024 11:41:21 +0800
Subject: [PATCH] enhance: [skip e2e] enable gpu e2e pipeline (#31821)
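
Run the PR GPU e2e pipeline on the 4am Jenkins cloud: abort previous
concurrent builds, schedule the build and test pods on 4am, re-enable
the pytest container in the GPU RTE pod, add a toleration for the
node-role.kubernetes.io/gpu:NoSchedule taint and drop most
nvidia.com/gpu.present nodeSelectors, lower the queryNode/indexNode GPU
request from 2 to 1, and point the helm proxy repo, ci-logs NFS server
and e2e script at their 4am counterparts.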

Signed-off-by: Liang Huang <sammy.huang@zilliz.com>
---
 ci/jenkins/PRGPU.groovy             | 12 ++++++-----
 ci/jenkins/pod/rte-gpu.yaml         | 31 ++++++++++++++---------------
 tests/scripts/values/ci/pr-gpu.yaml | 23 +++++++--------------
 3 files changed, 29 insertions(+), 37 deletions(-)

diff --git a/ci/jenkins/PRGPU.groovy b/ci/jenkins/PRGPU.groovy
index c828c8db1d..1f532755e3 100644
--- a/ci/jenkins/PRGPU.groovy
+++ b/ci/jenkins/PRGPU.groovy
@@ -12,13 +12,13 @@ pipeline {
         // buildDiscarder logRotator(artifactDaysToKeepStr: '30')
         // parallelsAlwaysFailFast()
         // preserveStashes(buildCount: 5)
-        // disableConcurrentBuilds(abortPrevious: true)
+        disableConcurrentBuilds(abortPrevious: true)
 
     }
     agent {
             kubernetes {
-                inheritFrom 'default'
-                defaultContainer 'main'
+                cloud '4am'
+                inheritFrom 'milvus-e2e-4am'
                 yamlFile 'ci/jenkins/pod/rte-gpu.yaml'
                 customWorkspace '/home/jenkins/agent/workspace'
             }
@@ -116,7 +116,7 @@ pipeline {
                                             withCredentials([usernamePassword(credentialsId: "${env.CI_DOCKER_CREDENTIAL_ID}", usernameVariable: 'CI_REGISTRY_USERNAME', passwordVariable: 'CI_REGISTRY_PASSWORD')]){
                                                 sh """
                                                 MILVUS_CLUSTER_ENABLED=${clusterEnabled} \
-                                                MILVUS_HELM_REPO="http://nexus-nexus-repository-manager.nexus:8081/repository/milvus-proxy" \
+                                                MILVUS_HELM_REPO="https://nexus-ci.zilliz.cc/repository/milvus-proxy" \
                                                 TAG=${imageTag}\
                                                 ./e2e-k8s.sh \
                                                 --skip-export-logs \
@@ -133,6 +133,7 @@ pipeline {
                                                 --set indexNode.disk.enabled=true \
                                                 --set queryNode.disk.enabled=true \
                                                 --set standalone.disk.enabled=true \
+                                                --set "tolerations[0].key=node-role.kubernetes.io/gpu,tolerations[0].operator=Exists,tolerations[0].effect=NoSchedule" \
                                                 --version ${chart_version} \
                                                 -f values/ci/pr-gpu.yaml" 
                                                 """
@@ -152,6 +153,7 @@ pipeline {
                         }
                         agent {
                                 kubernetes {
+                                    cloud '4am'
                                     inheritFrom 'default'
                                     defaultContainer 'main'
                                     yamlFile 'ci/jenkins/pod/e2e.yaml'
@@ -177,7 +179,7 @@ pipeline {
                                             MILVUS_HELM_NAMESPACE="milvus-ci" \
                                             MILVUS_CLUSTER_ENABLED="${clusterEnabled}" \
                                             TEST_TIMEOUT="${e2e_timeout_seconds}" \
-                                            ./ci_e2e.sh  "--tags GPU -n 6 -x --timeout ${case_timeout_seconds}"
+                                            ./ci_e2e_4am.sh  "--tags GPU -n 6 -x --timeout ${case_timeout_seconds}"
                                             """
                             
                                         } else {
diff --git a/ci/jenkins/pod/rte-gpu.yaml b/ci/jenkins/pod/rte-gpu.yaml
index 9fb7229126..275437ede1 100644
--- a/ci/jenkins/pod/rte-gpu.yaml
+++ b/ci/jenkins/pod/rte-gpu.yaml
@@ -48,18 +48,18 @@ spec:
       subPath: docker-volume-gpu
     - mountPath: /ci-logs
       name: ci-logs  
-  # - name: pytest
-  #   image: harbor.milvus.io/dockerhub/milvusdb/pytest:20230303-0cb8153
-  #   resources:
-  #     limits:
-  #       cpu: "6"
-  #       memory: 12Gi
-  #     requests:
-  #       cpu: "0.5"
-  #       memory: 5Gi
-  #   volumeMounts:
-  #   - mountPath: /ci-logs
-  #     name: ci-logs
+  - name: pytest
+    image: harbor.milvus.io/dockerhub/milvusdb/pytest:20240313-652b866
+    resources:
+      limits:
+        cpu: "6"
+        memory: 12Gi
+      requests:
+        cpu: "0.5"
+        memory: 5Gi
+    volumeMounts:
+    - mountPath: /ci-logs
+      name: ci-logs
   volumes:
   - emptyDir: {}
     name: docker-graph
@@ -79,7 +79,6 @@ spec:
     name: cgroup
   - name: ci-logs
     nfs:
-      path: /ci-logs
-      server: 172.16.70.239
-  nodeSelector:
-    nvidia.com/gpu.present: 'true'
+      path: /volume1/ci-logs
+      # path: /volume1/4am-logs
+      server: 172.16.70.249
diff --git a/tests/scripts/values/ci/pr-gpu.yaml b/tests/scripts/values/ci/pr-gpu.yaml
index 2bbff8e0fe..b677b8f1e3 100644
--- a/tests/scripts/values/ci/pr-gpu.yaml
+++ b/tests/scripts/values/ci/pr-gpu.yaml
@@ -2,24 +2,18 @@ metrics:
   serviceMonitor:
     enabled: true
 proxy:
-  nodeSelector:
-    nvidia.com/gpu.present: 'true'
   resources:
     requests:
       cpu: "0.1"
       memory: "256Mi"
 
 rootCoordinator:
-  nodeSelector:
-    nvidia.com/gpu.present: 'true'
   resources:
     requests:
       cpu: "0.1"
       memory: "256Mi"
 
 queryCoordinator:
-  nodeSelector:
-    nvidia.com/gpu.present: 'true'
   resources:
     requests:
       cpu: "0.4"
@@ -33,15 +27,13 @@ queryNode:
       value: "0,1"
   resources:
     requests:
-      nvidia.com/gpu: 2
+      nvidia.com/gpu: 1
       cpu: "0.5"
       memory: "500Mi"
     limits:
-      nvidia.com/gpu: 2
+      nvidia.com/gpu: 1
 
 indexCoordinator:
-  nodeSelector:
-    nvidia.com/gpu.present: 'true'
   resources:
     requests:
       cpu: "0.1"
@@ -55,23 +47,19 @@ indexNode:
       value: "0,1"
   resources:
     requests:
-      nvidia.com/gpu: 2
+      nvidia.com/gpu: 1
       cpu: "0.5"
       memory: "500Mi"
     limits:
-      nvidia.com/gpu: 2
+      nvidia.com/gpu: 1
 
 dataCoordinator:
-  nodeSelector:
-    nvidia.com/gpu.present: 'true'
   resources:
     requests:
       cpu: "0.1"
       memory: "50Mi"
 
 dataNode:
-  nodeSelector:
-    nvidia.com/gpu.present: 'true'
   resources:
     requests:
       cpu: "0.5"
@@ -192,6 +180,9 @@ minio:
       cpu: "0.3"
       memory: "512Mi"
 standalone:
+  persistence:
+    persistentVolumeClaim:
+      storageClass: "local-path"
   nodeSelector:
     nvidia.com/gpu.present: 'true'
   resources: