enhance: prevent segments loaded on a query node from excessively occupying a single GPU, leading to GPU memory overflow (#39276) (#38617)

issue: #39276

Signed-off-by: yusheng.ma <yusheng.ma@zilliz.com>
presburger 2025-01-15 20:15:01 +08:00 committed by GitHub
parent 0df2c75b77
commit 38881bf591
7 changed files with 179 additions and 10 deletions


@@ -95,7 +95,7 @@ milvus-gpu: build-cpp-gpu print-gpu-build-info
	@source $(PWD)/scripts/setenv.sh && \
	mkdir -p $(INSTALL_PATH) && go env -w CGO_ENABLED="1" && \
	CGO_LDFLAGS="$(CGO_LDFLAGS)" CGO_CFLAGS="$(CGO_CFLAGS)" GO111MODULE=on $(GO) build -pgo=$(PGO_PATH)/default.pgo -ldflags="-r $${RPATH} -X '$(OBJPREFIX).BuildTags=$(BUILD_TAGS_GPU)' -X '$(OBJPREFIX).BuildTime=$(BUILD_TIME)' -X '$(OBJPREFIX).GitCommit=$(GIT_COMMIT)' -X '$(OBJPREFIX).GoVersion=$(GO_VERSION)'" \
-		-tags $(MILVUS_GO_BUILD_TAGS) -o $(INSTALL_PATH)/milvus $(PWD)/cmd/main.go 1>/dev/null
+		-tags "$(MILVUS_GO_BUILD_TAGS),cuda" -o $(INSTALL_PATH)/milvus $(PWD)/cmd/main.go 1>/dev/null

get-build-deps:
	@(env bash $(PWD)/scripts/install_deps.sh)


@@ -1101,6 +1101,7 @@ trace:
gpu:
  initMemSize: 2048 # Gpu Memory Pool init size
  maxMemSize: 4096 # Gpu Memory Pool Max size
  overloadedMemoryThresholdPercentage: 95 # GPU memory usage may not exceed this percentage of a GPU's total memory when loading segments
# Any configuration related to the streaming node server.
streamingNode:


@@ -34,6 +34,7 @@ type ResourceUsage struct {
	MemorySize         uint64
	DiskSize           uint64
	MmapFieldCount     int
	FieldGpuMemorySize []uint64
}

// Segment is the interface of a segment implementation.


@@ -27,6 +27,7 @@ import (
	"context"
	"fmt"
	"io"
	"math"
	"path"
	"runtime/debug"
	"strconv"
@@ -44,6 +45,7 @@ import (
	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
	"github.com/milvus-io/milvus/internal/querynodev2/pkoracle"
	"github.com/milvus-io/milvus/internal/storage"
	"github.com/milvus-io/milvus/internal/util/vecindexmgr"
	"github.com/milvus-io/milvus/pkg/common"
	"github.com/milvus-io/milvus/pkg/log"
	"github.com/milvus-io/milvus/pkg/metrics"
@@ -1384,6 +1386,7 @@ func (loader *segmentLoader) checkSegmentSize(ctx context.Context, segmentLoadIn
	maxSegmentSize := uint64(0)
	predictMemUsage := memUsage
	predictDiskUsage := diskUsage
	var predictGpuMemUsage []uint64
	mmapFieldCount := 0
	for _, loadInfo := range segmentLoadInfos {
		collection := loader.manager.Collection.Get(loadInfo.GetCollectionID())
@@ -1406,6 +1409,7 @@ func (loader *segmentLoader) checkSegmentSize(ctx context.Context, segmentLoadIn
		mmapFieldCount += usage.MmapFieldCount
		predictDiskUsage += usage.DiskSize
		predictMemUsage += usage.MemorySize
		// Accumulate GPU index sizes across all segments being loaded; plain
		// assignment here would drop all but the last segment's usage.
		predictGpuMemUsage = append(predictGpuMemUsage, usage.FieldGpuMemorySize...)
		if usage.MemorySize > maxSegmentSize {
			maxSegmentSize = usage.MemorySize
		}
@@ -1440,6 +1444,10 @@ func (loader *segmentLoader) checkSegmentSize(ctx context.Context, segmentLoadIn
			paramtable.Get().QueryNodeCfg.MaxDiskUsagePercentage.GetAsFloat()))
	}

	err := checkSegmentGpuMemSize(predictGpuMemUsage, float32(paramtable.Get().GpuConfig.OverloadedMemoryThresholdPercentage.GetAsFloat()))
	if err != nil {
		return 0, 0, err
	}

	return predictMemUsage - memUsage, predictDiskUsage - diskUsage, nil
}
@@ -1448,6 +1456,7 @@ func getResourceUsageEstimateOfSegment(schema *schemapb.CollectionSchema, loadIn
	var segmentMemorySize, segmentDiskSize uint64
	var indexMemorySize uint64
	var mmapFieldCount int
	var fieldGpuMemorySize []uint64

	fieldID2IndexInfo := make(map[int64]*querypb.FieldIndexInfo)
	for _, fieldIndexInfo := range loadInfo.IndexInfos {
@@ -1492,9 +1501,11 @@ func getResourceUsageEstimateOfSegment(schema *schemapb.CollectionSchema, loadIn
				loadInfo.GetSegmentID(),
				fieldIndexInfo.GetBuildID())
		}
		indexMemorySize += estimateResult.MaxMemoryCost
		segmentDiskSize += estimateResult.MaxDiskCost
		if vecindexmgr.GetVecIndexMgrInstance().IsGPUVecIndex(common.GetIndexType(fieldIndexInfo.IndexParams)) {
			fieldGpuMemorySize = append(fieldGpuMemorySize, estimateResult.MaxMemoryCost)
		}
		if !estimateResult.HasRawData && !isVectorType {
			shouldCalculateDataSize = true
		}
@@ -1558,6 +1569,7 @@ func getResourceUsageEstimateOfSegment(schema *schemapb.CollectionSchema, loadIn
		MemorySize:         segmentMemorySize + indexMemorySize,
		DiskSize:           segmentDiskSize,
		MmapFieldCount:     mmapFieldCount,
		FieldGpuMemorySize: fieldGpuMemorySize,
	}, nil
}
@@ -1680,3 +1692,39 @@ func getBinlogDataMemorySize(fieldBinlog *datapb.FieldBinlog) int64 {
	return fieldSize
}

func checkSegmentGpuMemSize(fieldGpuMemSizeList []uint64, overloadedMemoryThresholdPercentage float32) error {
	gpuInfos, err := hardware.GetAllGPUMemoryInfo()
	if err != nil {
		// Without GPU info the check can only pass if no GPU memory is needed.
		if len(fieldGpuMemSizeList) == 0 {
			return nil
		}
		return err
	}

	var usedGpuMem []uint64
	var maxGpuMemSize []uint64
	for _, gpuInfo := range gpuInfos {
		usedGpuMem = append(usedGpuMem, gpuInfo.TotalMemory-gpuInfo.FreeMemory)
		maxGpuMemSize = append(maxGpuMemSize, uint64(float32(gpuInfo.TotalMemory)*overloadedMemoryThresholdPercentage))
	}

	// Copy the baseline so the error message below still reports the original usage.
	currentGpuMem := make([]uint64, len(usedGpuMem))
	copy(currentGpuMem, usedGpuMem)
	for _, fieldGpuMem := range fieldGpuMemSizeList {
		// Greedily place each GPU index on the GPU whose resulting usage is
		// lowest while staying under its threshold.
		minID := -1
		var minGpuMem uint64 = math.MaxUint64
		for i := 0; i < len(gpuInfos); i++ {
			memAfterLoad := currentGpuMem[i] + fieldGpuMem
			if memAfterLoad < maxGpuMemSize[i] && memAfterLoad < minGpuMem {
				minID = i
				minGpuMem = memAfterLoad
			}
		}
		if minID == -1 {
			return fmt.Errorf("load segment failed, GPU OOM if loaded, GpuMemUsage(bytes) = %v, usedGpuMem(bytes) = %v, maxGPUMem(bytes) = %v",
				fieldGpuMem,
				usedGpuMem,
				maxGpuMemSize)
		}
		// minGpuMem already includes fieldGpuMem; assigning (not adding) avoids
		// double-counting the field's memory.
		currentGpuMem[minID] = minGpuMem
	}
	return nil
}
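
For reference, a minimal, self-contained sketch of the greedy placement that checkSegmentGpuMemSize performs, with hypothetical GPU and index sizes (gpuInfo, placeFields, and all numbers below are illustrative, not part of this patch):

package main

import (
	"fmt"
	"math"
)

// gpuInfo mirrors hardware.GPUMemoryInfo with made-up values.
type gpuInfo struct {
	total, free uint64
}

// placeFields assigns each GPU index to the GPU whose resulting usage is
// lowest while staying under the per-GPU threshold, as the loader does.
func placeFields(gpus []gpuInfo, fields []uint64, threshold float32) error {
	used := make([]uint64, len(gpus))
	limit := make([]uint64, len(gpus))
	for i, g := range gpus {
		used[i] = g.total - g.free
		limit[i] = uint64(float32(g.total) * threshold)
	}
	for _, f := range fields {
		minID, minMem := -1, uint64(math.MaxUint64)
		for i := range gpus {
			if after := used[i] + f; after < limit[i] && after < minMem {
				minID, minMem = i, after
			}
		}
		if minID == -1 {
			return fmt.Errorf("GPU OOM if loaded: field needs %d bytes", f)
		}
		used[minID] = minMem
	}
	return nil
}

func main() {
	// Two 8 GiB GPUs, the first already half used; three 2 GiB GPU indexes.
	gpus := []gpuInfo{{total: 8 << 30, free: 4 << 30}, {total: 8 << 30, free: 8 << 30}}
	fields := []uint64{2 << 30, 2 << 30, 2 << 30}
	fmt.Println(placeFields(gpus, fields, 0.95)) // <nil>: the load spreads across both GPUs
}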


@@ -0,0 +1,18 @@
//go:build !cuda
// +build !cuda

package hardware

import "github.com/cockroachdb/errors"

// GPUMemoryInfo holds information about a GPU's memory
type GPUMemoryInfo struct {
	TotalMemory uint64 // Total memory available on the GPU
	FreeMemory  uint64 // Free memory available on the GPU
}

// GetAllGPUMemoryInfo returns mock GPU memory information for non-CUDA builds
func GetAllGPUMemoryInfo() ([]GPUMemoryInfo, error) {
	// Mock error to indicate no CUDA support
	return nil, errors.New("CUDA not supported: failed to retrieve GPU memory info or no GPUs found")
}


@@ -0,0 +1,90 @@
//go:build cuda
// +build cuda

package hardware

/*
#cgo CFLAGS: -I/usr/local/cuda/include
#cgo LDFLAGS: -L/usr/local/cuda/lib64 -lcudart
#include <cuda_runtime.h>
#include <stdlib.h>

// Structure to store GPU memory info
typedef struct {
    size_t totalMemory;
    size_t freeMemory;
} GPUMemoryInfo;

// Function to get memory info for all GPUs
int getAllGPUMemoryInfo(GPUMemoryInfo** infos) {
    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    if (err != cudaSuccess || deviceCount == 0) {
        return 0; // No GPUs found or error occurred
    }

    // Allocate memory for the output array
    *infos = (GPUMemoryInfo*)malloc(deviceCount * sizeof(GPUMemoryInfo));
    if (*infos == NULL) {
        return 0; // Memory allocation failed
    }

    for (int i = 0; i < deviceCount; ++i) {
        if (cudaSetDevice(i) != cudaSuccess) {
            (*infos)[i].totalMemory = 0;
            (*infos)[i].freeMemory = 0;
            continue; // Skip if the device cannot be set
        }

        size_t freeMem = 0, totalMem = 0;
        if (cudaMemGetInfo(&freeMem, &totalMem) != cudaSuccess) {
            (*infos)[i].totalMemory = 0;
            (*infos)[i].freeMemory = 0;
            continue; // Skip if memory info cannot be fetched
        }

        (*infos)[i].totalMemory = totalMem;
        (*infos)[i].freeMemory = freeMem;
    }
    return deviceCount; // Return the number of devices processed
}
*/
import "C"

import (
	"unsafe"

	"github.com/cockroachdb/errors"
)

// GPUMemoryInfo represents a single GPU's memory information.
type GPUMemoryInfo struct {
	TotalMemory uint64 // Total memory in bytes
	FreeMemory  uint64 // Free memory in bytes
}

// GetAllGPUMemoryInfo retrieves the memory information for all available GPUs.
// It returns a slice of GPUMemoryInfo and an error if no GPUs are found or retrieval fails.
func GetAllGPUMemoryInfo() ([]GPUMemoryInfo, error) {
	var infos *C.GPUMemoryInfo

	// Call the C function to retrieve GPU memory info
	deviceCount := int(C.getAllGPUMemoryInfo(&infos))
	if deviceCount == 0 {
		return nil, errors.New("failed to retrieve GPU memory info or no GPUs found")
	}
	defer C.free(unsafe.Pointer(infos)) // Free the allocated memory

	// Convert C array to Go slice
	gpuInfos := make([]GPUMemoryInfo, 0, deviceCount)
	infoArray := (*[1 << 30]C.GPUMemoryInfo)(unsafe.Pointer(infos))[:deviceCount:deviceCount]
	for i := 0; i < deviceCount; i++ {
		info := infoArray[i]
		gpuInfos = append(gpuInfos, GPUMemoryInfo{
			TotalMemory: uint64(info.totalMemory),
			FreeMemory:  uint64(info.freeMemory),
		})
	}
	return gpuInfos, nil
}
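
A hedged usage sketch for this API (hypothetical main package; assumes a binary built with -tags cuda and the pkg/util/hardware import path used elsewhere in Milvus):

package main

import (
	"fmt"

	"github.com/milvus-io/milvus/pkg/util/hardware" // assumed path, matching the package name above
)

func main() {
	infos, err := hardware.GetAllGPUMemoryInfo()
	if err != nil {
		// Non-cuda builds always take this branch via the stub above.
		fmt.Println("no usable GPU:", err)
		return
	}
	for i, info := range infos {
		fmt.Printf("GPU %d: %d of %d bytes free\n", i, info.FreeMemory, info.TotalMemory)
	}
}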


@@ -972,6 +972,7 @@ This helps Milvus-CDC synchronize incremental data`,
type gpuConfig struct {
	InitSize                            ParamItem `refreshable:"false"`
	MaxSize                             ParamItem `refreshable:"false"`
	OverloadedMemoryThresholdPercentage ParamItem `refreshable:"false"`
}

func (t *gpuConfig) init(base *BaseTable) {
@@ -992,6 +993,16 @@ func (t *gpuConfig) init(base *BaseTable) {
		DefaultValue: "4096",
	}
	t.MaxSize.Init(base.mgr)

	t.OverloadedMemoryThresholdPercentage = ParamItem{
		Key:          "gpu.overloadedMemoryThresholdPercentage",
		Version:      "2.5.4",
		Export:       true,
		DefaultValue: "95",
		Formatter: func(v string) string {
			return fmt.Sprintf("%f", getAsFloat(v)/100)
		},
	}
	t.OverloadedMemoryThresholdPercentage.Init(base.mgr)
}
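
The Formatter normalizes the configured percentage to a fraction, so GetAsFloat returns 0.95 for the default "95". An illustrative, standalone sketch of that arithmetic as checkSegmentGpuMemSize applies it (the 8 GiB figure is hypothetical):

package main

import "fmt"

func main() {
	// gpu.overloadedMemoryThresholdPercentage as configured in milvus.yaml.
	const configured = 95.0
	threshold := configured / 100 // what GetAsFloat returns after the Formatter runs

	totalGpuMemory := uint64(8 << 30) // hypothetical 8 GiB GPU
	ceiling := uint64(float64(totalGpuMemory) * threshold)
	fmt.Printf("per-GPU ceiling: %d bytes (%.0f%% of %d)\n", ceiling, configured, totalGpuMemory)
}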
type traceConfig struct {