fix: [2.5] Decode unicode for json key in expression (#38653)

issue: #38626 

master pr: #38651

Signed-off-by: Cai Zhang <cai.zhang@zilliz.com>
pull/38663/head
cai.zhang 2024-12-23 13:58:49 +08:00 committed by GitHub
parent 7d46a8f17e
commit bb3d993da5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 82 additions and 32 deletions

View File

@ -28,6 +28,7 @@ func (v *ParserVisitor) VisitParens(ctx *parser.ParensContext) interface{} {
}
func (v *ParserVisitor) translateIdentifier(identifier string) (*ExprWithType, error) {
identifier = decodeUnicode(identifier)
field, err := v.schema.GetFieldFromNameDefaultJSON(identifier)
if err != nil {
return nil, err
@ -1005,6 +1006,7 @@ func (v *ParserVisitor) VisitBitOr(ctx *parser.BitOrContext) interface{} {
*/
// More tests refer to plan_parser_v2_test.go::Test_JSONExpr
func (v *ParserVisitor) getColumnInfoFromJSONIdentifier(identifier string) (*planpb.ColumnInfo, error) {
identifier = decodeUnicode(identifier)
fieldName := strings.Split(identifier, "[")[0]
nestedPath := make([]string, 0)
field, err := v.schema.GetFieldFromNameDefaultJSON(fieldName)

View File

@ -2,9 +2,7 @@ package planparserv2
import (
"fmt"
"strings"
"time"
"unicode"
"github.com/antlr4-go/antlr/v4"
"github.com/hashicorp/golang-lru/v2/expirable"
@ -153,36 +151,6 @@ func CreateRetrievePlan(schema *typeutil.SchemaHelper, exprStr string, exprTempl
return planNode, nil
}
// convertHanToASCII returns s with every rune of the Unicode Han script
// replaced by the string formatUnicode produces for its code point.
//
// A backslash and the rune that follows it are copied through verbatim, so
// the rune after a backslash is never converted. If a backslash is followed
// by a byte that isEscapeCh rejects, the conversion is abandoned and the
// original string is returned unchanged.
func convertHanToASCII(s string) string {
	var out strings.Builder
	// Reserve six bytes per input byte up front.
	// NOTE(review): presumably sized for formatUnicode emitting six-character
	// \uXXXX escapes — confirm against formatUnicode.
	out.Grow(6 * len(s))

	// escaped is true when the previous rune was a backslash that opened an
	// escape sequence; the current rune is then copied as-is.
	escaped := false
	for pos, ch := range s {
		switch {
		case escaped:
			out.WriteRune(ch)
			escaped = false
		case ch == '\\':
			// Look at the byte right after the backslash, if there is one.
			if next := pos + 1; next < len(s) && !isEscapeCh(s[next]) {
				return s
			}
			escaped = true
			out.WriteRune(ch)
		case unicode.Is(unicode.Han, ch):
			out.WriteString(formatUnicode(uint32(ch)))
		default:
			out.WriteRune(ch)
		}
	}
	return out.String()
}
func CreateSearchPlan(schema *typeutil.SchemaHelper, exprStr string, vectorFieldName string, queryInfo *planpb.QueryInfo, exprTemplateValues map[string]*schemapb.TemplateValue) (*planpb.PlanNode, error) {
parse := func() (*planpb.Expr, error) {
if len(exprStr) <= 0 {

View File

@ -1473,3 +1473,36 @@ func BenchmarkTemplateWithString(b *testing.B) {
assert.NotNil(b, plan)
}
}
// TestNestedPathWithChinese builds search plans from expressions whose JSON
// keys are written with Chinese characters and checks that the nested path
// stored in the resulting unary range expression holds the field name
// followed by each key in its original form.
func TestNestedPathWithChinese(t *testing.T) {
	helper := newTestSchemaHelper(t)

	cases := []struct {
		expr string
		want []string
	}{
		{expr: `A["姓名"] == "小明"`, want: []string{"A", "姓名"}},
		{expr: `A["年份"]["月份"] == "九月"`, want: []string{"A", "年份", "月份"}},
	}

	for _, tc := range cases {
		queryInfo := &planpb.QueryInfo{
			Topk:         0,
			MetricType:   "",
			SearchParams: "",
			RoundDecimal: 0,
		}
		plan, err := CreateSearchPlan(helper, tc.expr, "FloatVectorField", queryInfo, nil)
		assert.NoError(t, err, tc.expr)

		columnInfo := plan.GetVectorAnns().GetPredicates().GetUnaryRangeExpr().GetColumnInfo()
		got := columnInfo.GetNestedPath()
		assert.NotNil(t, got)
		assert.Equal(t, len(tc.want), len(got))
		for idx, segment := range tc.want {
			assert.Equal(t, segment, got[idx])
		}
	}
}

View File

@ -2,8 +2,10 @@ package planparserv2
import (
"fmt"
"regexp"
"strconv"
"strings"
"unicode"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/json"
@ -730,3 +732,41 @@ func parseJSONValue(value interface{}) (*planpb.GenericValue, schemapb.DataType,
return nil, schemapb.DataType_None, fmt.Errorf("%v is of unknown type: %T\n", value, v)
}
}
// convertHanToASCII returns s with every rune of the Unicode Han script
// replaced by the string formatUnicode produces for its code point.
//
// A backslash and the rune that follows it are copied through verbatim, so
// the rune after a backslash is never converted. If a backslash is followed
// by a byte that isEscapeCh rejects, the conversion is abandoned and the
// original string is returned unchanged.
func convertHanToASCII(s string) string {
	var out strings.Builder
	// Reserve six bytes per input byte up front.
	// NOTE(review): presumably sized for formatUnicode emitting six-character
	// \uXXXX escapes — confirm against formatUnicode.
	out.Grow(6 * len(s))

	// escaped is true when the previous rune was a backslash that opened an
	// escape sequence; the current rune is then copied as-is.
	escaped := false
	for pos, ch := range s {
		switch {
		case escaped:
			out.WriteRune(ch)
			escaped = false
		case ch == '\\':
			// Look at the byte right after the backslash, if there is one.
			if next := pos + 1; next < len(s) && !isEscapeCh(s[next]) {
				return s
			}
			escaped = true
			out.WriteRune(ch)
		case unicode.Is(unicode.Han, ch):
			out.WriteString(formatUnicode(uint32(ch)))
		default:
			out.WriteRune(ch)
		}
	}
	return out.String()
}
// unicodeEscapePattern matches, in order of preference, a UTF-16 surrogate
// pair written as two consecutive \uXXXX escapes (high half D800-DBFF followed
// by low half DC00-DFFF), or a single \uXXXX escape. Go's regexp picks the
// first alternative that matches, so a well-formed pair is always consumed as
// one unit. Compiled once at package scope because decodeUnicode runs for
// every identifier the parser translates.
var unicodeEscapePattern = regexp.MustCompile(
	`\\u[dD][89abAB][0-9a-fA-F]{2}\\u[dD][c-fC-F][0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}`)

// decodeUnicode replaces every \uXXXX escape in input (a backslash, the
// letter u and exactly four hex digits) with the character it denotes and
// returns the result. Text that is not part of such an escape is copied
// through unchanged.
//
// Two adjacent escapes that form a UTF-16 surrogate pair are combined into the
// single code point they encode, e.g. `\ud83d\ude00` becomes U+1F600. A lone
// surrogate half has no valid encoding and becomes U+FFFD.
func decodeUnicode(input string) string {
	return unicodeEscapePattern.ReplaceAllStringFunc(input, func(match string) string {
		// match is either `\uXXXX` (6 bytes) or `\uXXXX\uXXXX` (12 bytes).
		first, err := strconv.ParseUint(match[2:6], 16, 32)
		if err != nil {
			// Unreachable given the pattern; keep the text rather than corrupt it.
			return match
		}
		if len(match) == 6 {
			return string(rune(first))
		}

		second, err := strconv.ParseUint(match[8:12], 16, 32)
		if err != nil {
			return match
		}
		// Standard UTF-16 decoding of a (high, low) surrogate pair.
		return string(rune(0x10000 + ((first - 0xD800) << 10) + (second - 0xDC00)))
	})
}

View File

@ -328,3 +328,10 @@ func Test_getArrayElementType(t *testing.T) {
assert.Equal(t, schemapb.DataType_None, getArrayElementType(expr))
})
}
// Test_decodeUnicode checks that a JSON path whose keys are spelled with
// \uXXXX escapes differs from the literal-character form before decoding and
// equals it after decodeUnicode has been applied.
func Test_decodeUnicode(t *testing.T) {
	const want = `A["年份"]["月份"]`
	escaped := "A[\"\\u5e74\\u4efd\"][\"\\u6708\\u4efd\"]"

	// Sanity check: the escaped spelling is not already the decoded text.
	assert.NotEqual(t, want, escaped)

	decoded := decodeUnicode(escaped)
	assert.Equal(t, want, decoded)
}