fix: binary vector should not limit dimension to 32768 (#30676)

all vector dimension checks should happen at collection creation,
not at index build
fix #30285

Signed-off-by: xiaofanluan <xiaofan.luan@zilliz.com>
pull/31000/head
Xiaofan 2024-03-04 22:21:00 -08:00 committed by GitHub
parent 1936aa4caa
commit 4bda6c33ad
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 64 additions and 37 deletions

View File

@ -127,6 +127,7 @@ if (LINUX OR MSYS)
"-DELPP_THREAD_SAFE"
"-fopenmp"
"-Wno-error"
"-Wno-all"
)
if (CMAKE_BUILD_TYPE STREQUAL "Release")
append_flags( CMAKE_CXX_FLAGS
@ -141,17 +142,9 @@ if ( APPLE )
"-fPIC"
"-DELPP_THREAD_SAFE"
"-fopenmp"
"-Wno-error"
"-Wsign-compare"
"-Wall"
"-pedantic"
"-Wno-unused-command-line-argument"
"-Wextra"
"-Wno-unused-parameter"
"-Wno-deprecated"
"-Wno-all"
"-DBOOST_STACKTRACE_GNU_SOURCE_NOT_REQUIRED=1"
#"-fvisibility=hidden"
#"-fvisibility-inlines-hidden"
)
endif ()

View File

@ -311,11 +311,21 @@ func validateDimension(field *schemapb.FieldSchema) error {
return errors.New("dimension is not defined in field type params, check type param `dim` for vector field")
}
if dim <= 0 || dim > Params.ProxyCfg.MaxDimension.GetAsInt64() {
return fmt.Errorf("invalid dimension: %d. should be in range 1 ~ %d", dim, Params.ProxyCfg.MaxDimension.GetAsInt())
if dim <= 1 {
return fmt.Errorf("invalid dimension: %d. should be in range 2 ~ %d", dim, Params.ProxyCfg.MaxDimension.GetAsInt())
}
if field.DataType == schemapb.DataType_BinaryVector && dim%8 != 0 {
return fmt.Errorf("invalid dimension: %d. should be multiple of 8. ", dim)
if field.DataType != schemapb.DataType_BinaryVector {
if dim > Params.ProxyCfg.MaxDimension.GetAsInt64() {
return fmt.Errorf("invalid dimension: %d. float vector dimension should be in range 2 ~ %d", dim, Params.ProxyCfg.MaxDimension.GetAsInt())
}
} else {
if dim%8 != 0 {
return fmt.Errorf("invalid dimension: %d. binary vector dimension should be multiple of 8. ", dim)
}
if dim > Params.ProxyCfg.MaxDimension.GetAsInt64()*8 {
return fmt.Errorf("invalid dimension: %d. binary vector dimension should be in range 2 ~ %d", dim, Params.ProxyCfg.MaxDimension.GetAsInt()*8)
}
}
return nil
}

View File

@ -190,6 +190,16 @@ func TestValidateDimension(t *testing.T) {
},
},
}
assert.NotNil(t, validateDimension(fieldSchema))
fieldSchema = &schemapb.FieldSchema{
DataType: schemapb.DataType_FloatVector,
TypeParams: []*commonpb.KeyValuePair{
{
Key: common.DimKey,
Value: "2",
},
},
}
assert.Nil(t, validateDimension(fieldSchema))
fieldSchema.TypeParams = []*commonpb.KeyValuePair{
{
@ -237,6 +247,14 @@ func TestValidateDimension(t *testing.T) {
},
}
assert.NotNil(t, validateDimension(fieldSchema))
fieldSchema.TypeParams = []*commonpb.KeyValuePair{
{
Key: common.DimKey,
Value: "262145",
},
}
assert.NotNil(t, validateDimension(fieldSchema))
}
func TestValidateVectorFieldMetricType(t *testing.T) {

View File

@ -1,6 +1,25 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package indexparamcheck
import (
"fmt"
"math"
"github.com/cockroachdb/errors"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
@ -9,10 +28,10 @@ import (
type baseChecker struct{}
func (c baseChecker) CheckTrain(params map[string]string) error {
if !CheckIntByRange(params, DIM, DefaultMinDim, DefaultMaxDim) {
return errOutOfRange(DIM, DefaultMinDim, DefaultMaxDim)
// vector dimension should be checked on collection creation. this is just some basic check
if !CheckIntByRange(params, DIM, 1, math.MaxInt) {
return fmt.Errorf("failed to check vector dimension, should be larger than 0 and smaller than math.MaxInt")
}
return nil
}

View File

@ -15,11 +15,6 @@ const (
// MaxNList is the upper limit of nlist that used in Index IVFxxx
MaxNList = 65536
// DefaultMinDim is the smallest dimension supported in Milvus
DefaultMinDim = 1
// DefaultMaxDim is the largest dimension supported in Milvus
DefaultMaxDim = 32768
HNSWMinEfConstruction = 1
HNSWMaxEfConstruction = 2147483647
HNSWMinM = 1

View File

@ -57,9 +57,6 @@ func Test_ivfPQChecker_CheckTrain(t *testing.T) {
invalidParamsIVF := copyParams(validParams)
invalidParamsIVF[IVFM] = "NAN"
invalidParamsM := copyParams(validParams)
invalidParamsM[DIM] = strconv.Itoa(65536)
invalidParamsMzero := copyParams(validParams)
invalidParamsMzero[IVFM] = "0"
@ -128,7 +125,6 @@ func Test_ivfPQChecker_CheckTrain(t *testing.T) {
{invalidParamsNbits, false},
{invalidParamsWithoutIVF, false},
{invalidParamsIVF, false},
{invalidParamsM, false},
{invalidParamsMzero, false},
{p1, true},
{p2, true},

View File

@ -49,9 +49,6 @@ func Test_raftIVFPQChecker_CheckTrain(t *testing.T) {
invalidParamsIVF := copyParams(validParams)
invalidParamsIVF[IVFM] = "NAN"
invalidParamsM := copyParams(validParams)
invalidParamsM[DIM] = strconv.Itoa(65536)
validParamsMzero := copyParams(validParams)
validParamsMzero[IVFM] = "0"
@ -135,7 +132,6 @@ func Test_raftIVFPQChecker_CheckTrain(t *testing.T) {
{invalidParamsNbits, false},
{invalidParamsWithoutIVF, false},
{invalidParamsIVF, false},
{invalidParamsM, false},
{validParamsMzero, true},
{p1, true},
{p2, true},

View File

@ -65,7 +65,7 @@ float_field_desc = "float type field"
float_vec_field_desc = "float vector type field"
binary_vec_field_desc = "binary vector type field"
max_dim = 32768
min_dim = 1
min_dim = 2
gracefulTime = 1
default_nlist = 128
compact_segment_num_threshold = 3

View File

@ -120,7 +120,7 @@ class TestMilvusClientCollectionInvalid(TestcaseBase):
client = self._connect(enable_milvus_client_api=True)
collection_name = cf.gen_unique_str(prefix)
# 1. create collection
error = {ct.err_code: 65535, ct.err_msg: f"invalid dimension: {dim}. should be in range 1 ~ 32768"}
error = {ct.err_code: 65535, ct.err_msg: f"invalid dimension: {dim}. should be in range 2 ~ 32768"}
client_w.create_collection(client, collection_name, dim,
check_task=CheckTasks.err_res, check_items=error)
client_w.drop_collection(client, collection_name)

View File

@ -1678,7 +1678,7 @@ class TestCollectionCountBinary(TestcaseBase):
@pytest.fixture(
scope="function",
params=[
1,
8,
1000,
2001
],
@ -1711,12 +1711,12 @@ class TestCollectionCountBinary(TestcaseBase):
expected: check error message successfully
"""
self._connect()
dim = 1
dim = 2
c_schema = cf.gen_default_binary_collection_schema(auto_id=auto_id, dim=dim)
collection_w = self.init_collection_wrap(schema=c_schema,
check_task=CheckTasks.err_res,
check_items={"err_code": 1,
"err_msg": f"invalid dimension: {dim}. should be multiple of 8."})
"err_msg": f"invalid dimension: {dim}. binary vector dimension should be multiple of 8."})
@pytest.mark.tags(CaseLabel.L2)
def test_collection_count_no_entities(self):
@ -4336,7 +4336,7 @@ class TestCollectionMultipleVectorValid(TestcaseBase):
"""
self._connect()
c_name = cf.gen_unique_str(prefix)
another_dim = 1
another_dim = 2
schema = cf.gen_default_collection_schema(primary_field=primary_key, auto_id=auto_id, dim=ct.max_dim,
enable_dynamic_field=enable_dynamic_field,
multiple_dim_array=[another_dim])

View File

@ -473,7 +473,7 @@ class TestCollectionSearchInvalid(TestcaseBase):
"""
# 1. create a collection
nb = 1
dim = 1
dim = 2
fields = [cf.gen_int64_field("int64_1"), cf.gen_int64_field("int64_2"),
cf.gen_float_vec_field(dim=dim)]
schema = cf.gen_collection_schema(fields=fields, primary_field="int64_1")
@ -3402,7 +3402,7 @@ class TestCollectionSearch(TestcaseBase):
"""
# 1. create a collection
nb = 10
dim = 1
dim = 2
fields = [cf.gen_int64_field("int64_1"), cf.gen_int64_field("int64_2"),
cf.gen_float_vec_field(dim=dim)]
schema = cf.gen_collection_schema(fields=fields, primary_field="int64_1")