enhance: [skip e2e] enable gpu e2e pipeline (#31821)

Signed-off-by: Liang Huang <sammy.huang@zilliz.com>
2024-04-03 11:41:21 +08:00 · 2024-04-03 11:41:21 +08:00 · ae307af19e
parent 1af2ee903c
commit ae307af19e
3 changed files with 29 additions and 37 deletions
--- a/ci/jenkins/PRGPU.groovy
+++ b/ci/jenkins/PRGPU.groovy
@ -12,13 +12,13 @@ pipeline {
        // buildDiscarder logRotator(artifactDaysToKeepStr: '30')
        // parallelsAlwaysFailFast()
        // preserveStashes(buildCount: 5)
-        // disableConcurrentBuilds(abortPrevious: true)
+        disableConcurrentBuilds(abortPrevious: true)

    }
    agent {
            kubernetes {
-                inheritFrom 'default'
-                defaultContainer 'main'
+                cloud '4am'
+                inheritFrom 'milvus-e2e-4am'
                yamlFile 'ci/jenkins/pod/rte-gpu.yaml'
                customWorkspace '/home/jenkins/agent/workspace'
            }
@ -116,7 +116,7 @@ pipeline {
                                            withCredentials([usernamePassword(credentialsId: "${env.CI_DOCKER_CREDENTIAL_ID}", usernameVariable: 'CI_REGISTRY_USERNAME', passwordVariable: 'CI_REGISTRY_PASSWORD')]){
                                                sh """
                                                MILVUS_CLUSTER_ENABLED=${clusterEnabled} \
-                                                MILVUS_HELM_REPO="http://nexus-nexus-repository-manager.nexus:8081/repository/milvus-proxy" \
+                                                MILVUS_HELM_REPO="https://nexus-ci.zilliz.cc/repository/milvus-proxy" \
                                                TAG=${imageTag}\
                                                ./e2e-k8s.sh \
                                                --skip-export-logs \
@ -133,6 +133,7 @@ pipeline {
                                                --set indexNode.disk.enabled=true \
                                                --set queryNode.disk.enabled=true \
                                                --set standalone.disk.enabled=true \
+                                                --set "tolerations[0].key=node-role.kubernetes.io/gpu,tolerations[0].operator=Exists,tolerations[0].effect=NoSchedule" \
                                                --version ${chart_version} \
                                                -f values/ci/pr-gpu.yaml" 
                                                """
@ -152,6 +153,7 @@ pipeline {
                        }
                        agent {
                                kubernetes {
+                                    cloud '4am'
                                    inheritFrom 'default'
                                    defaultContainer 'main'
                                    yamlFile 'ci/jenkins/pod/e2e.yaml'
@ -177,7 +179,7 @@ pipeline {
                                            MILVUS_HELM_NAMESPACE="milvus-ci" \
                                            MILVUS_CLUSTER_ENABLED="${clusterEnabled}" \
                                            TEST_TIMEOUT="${e2e_timeout_seconds}" \
-                                            ./ci_e2e.sh  "--tags GPU -n 6 -x --timeout ${case_timeout_seconds}"
+                                            ./ci_e2e_4am.sh  "--tags GPU -n 6 -x --timeout ${case_timeout_seconds}"
                                            """
                            
                                        } else {
--- a/ci/jenkins/pod/rte-gpu.yaml
+++ b/ci/jenkins/pod/rte-gpu.yaml
@ -48,18 +48,18 @@ spec:
      subPath: docker-volume-gpu
    - mountPath: /ci-logs
      name: ci-logs  
-  # - name: pytest
-  #   image: harbor.milvus.io/dockerhub/milvusdb/pytest:20230303-0cb8153
-  #   resources:
-  #     limits:
-  #       cpu: "6"
-  #       memory: 12Gi
-  #     requests:
-  #       cpu: "0.5"
-  #       memory: 5Gi
-  #   volumeMounts:
-  #   - mountPath: /ci-logs
-  #     name: ci-logs
+  - name: pytest
+    image: harbor.milvus.io/dockerhub/milvusdb/pytest:20240313-652b866
+    resources:
+      limits:
+        cpu: "6"
+        memory: 12Gi
+      requests:
+        cpu: "0.5"
+        memory: 5Gi
+    volumeMounts:
+    - mountPath: /ci-logs
+      name: ci-logs
  volumes:
  - emptyDir: {}
    name: docker-graph
@ -79,7 +79,6 @@ spec:
    name: cgroup
  - name: ci-logs
    nfs:
-      path: /ci-logs
-      server: 172.16.70.239
-  nodeSelector:
-    nvidia.com/gpu.present: 'true'
+      path: /volume1/ci-logs
+      # path: /volume1/4am-logs
+      server: 172.16.70.249
--- a/tests/scripts/values/ci/pr-gpu.yaml
+++ b/tests/scripts/values/ci/pr-gpu.yaml
@ -2,24 +2,18 @@ metrics:
  serviceMonitor:
    enabled: true
 proxy:
-  nodeSelector:
-    nvidia.com/gpu.present: 'true'
  resources:
    requests:
      cpu: "0.1"
      memory: "256Mi"

 rootCoordinator:
-  nodeSelector:
-    nvidia.com/gpu.present: 'true'
  resources:
    requests:
      cpu: "0.1"
      memory: "256Mi"

 queryCoordinator:
-  nodeSelector:
-    nvidia.com/gpu.present: 'true'
  resources:
    requests:
      cpu: "0.4"
@ -33,15 +27,13 @@ queryNode:
      value: "0,1"
  resources:
    requests:
-      nvidia.com/gpu: 2
+      nvidia.com/gpu: 1
      cpu: "0.5"
      memory: "500Mi"
    limits:
-      nvidia.com/gpu: 2
+      nvidia.com/gpu: 1

 indexCoordinator:
-  nodeSelector:
-    nvidia.com/gpu.present: 'true'
  resources:
    requests:
      cpu: "0.1"
@ -55,23 +47,19 @@ indexNode:
      value: "0,1"
  resources:
    requests:
-      nvidia.com/gpu: 2
+      nvidia.com/gpu: 1
      cpu: "0.5"
      memory: "500Mi"
    limits:
-      nvidia.com/gpu: 2
+      nvidia.com/gpu: 1

 dataCoordinator:
-  nodeSelector:
-    nvidia.com/gpu.present: 'true'
  resources:
    requests:
      cpu: "0.1"
      memory: "50Mi"

 dataNode:
-  nodeSelector:
-    nvidia.com/gpu.present: 'true'
  resources:
    requests:
      cpu: "0.5"
@ -192,6 +180,9 @@ minio:
      cpu: "0.3"
      memory: "512Mi"
 standalone:
+  persistence:
+    persistentVolumeClaim:
+       storageClass: "local-path"
  nodeSelector:
    nvidia.com/gpu.present: 'true'
  resources: