milvus/internal/storage/vector_chunk_manager.go

340 lines
9.0 KiB
Go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package storage
import (
"context"
"io"
"path"
"sync"
"time"
"github.com/cockroachdb/errors"
"go.uber.org/zap"
"golang.org/x/exp/mmap"
"github.com/milvus-io/milvus/pkg/common"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/cache"
)
var (
	// defaultLocalCacheSize is not referenced anywhere in the visible part of
	// this file.
	// NOTE(review): presumably a default limit for the local cache — confirm
	// its unit and where it is consumed.
	defaultLocalCacheSize = 64
)
// VectorChunkManager is responsible for read and write vector data.
//
// Reads fetch binlog-style files from vectorStorage and deserialize them into
// raw field bytes. When cacheEnable is set, the deserialized bytes are also
// written to cacheStorage and served from an mmap-backed loading cache.
type VectorChunkManager struct {
	// cacheStorage holds the deserialized vector data that backs the cache.
	cacheStorage ChunkManager
	// vectorStorage is the storage that all write, list, remove and uncached
	// read operations are delegated to.
	vectorStorage ChunkManager
	// cache maps a file path to the mmap reader of its deserialized copy in
	// cacheStorage. It is only initialized when cacheEnable is true.
	cache cache.LoadingCache[string, *mmap.ReaderAt]
	// insertCodec deserializes the binlog content read from vectorStorage.
	insertCodec *InsertCodec
	// cacheEnable switches the cached read path on or off.
	cacheEnable bool
	// cacheLimit is passed to the cache as its maximum size; it must be
	// positive when cacheEnable is true.
	cacheLimit int64
	// cacheSize is the sum of the mmap lengths of the entries loaded into the
	// cache and not yet removed.
	cacheSize int64
	// cacheSizeMutex guards cacheSize.
	cacheSizeMutex sync.Mutex
}
// Compile-time assertion that *VectorChunkManager implements ChunkManager.
var _ ChunkManager = (*VectorChunkManager)(nil)
// NewVectorChunkManager builds a VectorChunkManager on top of the given cache
// and vector storages and initializes its cache.
//
// It returns an error when cache initialization fails, which happens when
// cacheEnable is true and cacheLimit is not positive. ctx is forwarded to the
// cache initialization.
func NewVectorChunkManager(ctx context.Context, cacheStorage ChunkManager, vectorStorage ChunkManager, cacheLimit int64, cacheEnable bool) (*VectorChunkManager, error) {
	manager := &VectorChunkManager{
		cacheStorage:  cacheStorage,
		vectorStorage: vectorStorage,
		insertCodec:   NewInsertCodec(),
		cacheEnable:   cacheEnable,
		cacheLimit:    cacheLimit,
	}
	if err := manager.initCache(ctx); err != nil {
		return nil, err
	}
	return manager, nil
}
// initCache creates the loading cache used by the cached read path.
//
// It does nothing when cacheEnable is false and returns an error when
// cacheLimit is not positive. cacheLimit is handed to the cache as its maximum
// size.
//
// ctx is captured by the loader and the removal listener, so the same context
// is reused for every later cache load and removal.
func (vcm *VectorChunkManager) initCache(ctx context.Context) error {
	if !vcm.cacheEnable {
		return nil
	}
	if vcm.cacheLimit <= 0 {
		return errors.New("cache limit must be positive if cacheEnable")
	}

	// loader fetches, deserializes and mmaps a file on a cache miss.
	loader := func(filePath string) (*mmap.ReaderAt, error) {
		return vcm.readFile(ctx, filePath)
	}

	// onRemoveFn releases the resources of a removed entry: it unmaps the
	// reader, deletes the local copy and gives the bytes back to cacheSize.
	// Failures are logged (with their cause) and cleanup continues regardless.
	onRemoveFn := func(filePath string, v *mmap.ReaderAt) {
		// Len must be read before Close.
		size := v.Len()
		if err := v.Close(); err != nil {
			log.Error("close mmap file failed", zap.String("file", filePath), zap.Error(err))
		}
		localPath := path.Join(vcm.cacheStorage.RootPath(), filePath)
		if err := vcm.cacheStorage.Remove(ctx, localPath); err != nil {
			log.Error("cache storage remove file failed", zap.String("file", localPath), zap.Error(err))
		}
		vcm.cacheSizeMutex.Lock()
		vcm.cacheSize -= int64(size)
		vcm.cacheSizeMutex.Unlock()
	}

	vcm.cache = cache.NewLoadingCache(loader,
		cache.WithRemovalListener[string, *mmap.ReaderAt](onRemoveFn),
		cache.WithMaximumSize[string, *mmap.ReaderAt](vcm.cacheLimit),
	)
	return nil
}
// deserializeVectorFile converts the binlog-style content of a vector file
// into the raw bytes of its field data.
//
// For vector data, we download the vector file from storage, deserialize it
// because it has binlog style, and finally store the pure vector data to
// local storage as cache.
//
// filePath is used as the blob key. The function returns the deserialization
// error if the codec fails, and the first conversion error if no field could
// be converted to bytes.
func (vcm *VectorChunkManager) deserializeVectorFile(filePath string, content []byte) ([]byte, error) {
	blob := &Blob{
		Key:   filePath,
		Value: content,
	}
	_, _, data, err := vcm.insertCodec.Deserialize([]*Blob{blob})
	if err != nil {
		return nil, err
	}

	// Note: here we assume that only one field in the binlog.
	var (
		results   []byte
		converted bool
		firstErr  error
	)
	for _, singleData := range data.Data {
		bs, err := FieldDataToBytes(common.Endian, singleData)
		if err != nil {
			if firstErr == nil {
				firstErr = err
			}
			continue
		}
		results, converted = bs, true
	}
	// Surface the failure instead of silently returning empty data. A
	// successful conversion of any field still wins, as it did before.
	if !converted && firstErr != nil {
		return nil, errors.Wrapf(firstErr, "convert field data of %s to bytes", filePath)
	}
	return results, nil
}
// RootPath returns the root path of the underlying vector storage.
func (vcm *VectorChunkManager) RootPath() string {
	return vcm.vectorStorage.RootPath()
}
// Path returns the path of the vector data as reported by the underlying
// vector storage. The local cache is not consulted, so the result is the same
// whether or not the file is cached.
func (vcm *VectorChunkManager) Path(ctx context.Context, filePath string) (string, error) {
	return vcm.vectorStorage.Path(ctx, filePath)
}
// Size returns the size of filePath as reported by the underlying vector
// storage, i.e. of the stored file rather than of its deserialized content.
func (vcm *VectorChunkManager) Size(ctx context.Context, filePath string) (int64, error) {
	return vcm.vectorStorage.Size(ctx, filePath)
}
// Write writes content to filePath in the underlying vector storage. The
// local cache is neither populated nor invalidated by this call.
func (vcm *VectorChunkManager) Write(ctx context.Context, filePath string, content []byte) error {
	return vcm.vectorStorage.Write(ctx, filePath, content)
}
// MultiWrite writes every entry of contents (file path -> content) to the
// underlying vector storage. The local cache is neither populated nor
// invalidated by this call.
func (vcm *VectorChunkManager) MultiWrite(ctx context.Context, contents map[string][]byte) error {
	return vcm.vectorStorage.MultiWrite(ctx, contents)
}
// Exist reports whether filePath exists in the underlying vector storage. The
// local cache is not consulted.
func (vcm *VectorChunkManager) Exist(ctx context.Context, filePath string) (bool, error) {
	return vcm.vectorStorage.Exist(ctx, filePath)
}
// readFile loads filePath into the local cache storage and returns an mmap
// reader over the cached copy.
//
// The file is read from the vector storage, deserialized into raw field
// bytes, written below the cache storage root and then mmapped. On success the
// mapped length is added to cacheSize. Any failing step aborts the load and
// its error is returned unchanged.
func (vcm *VectorChunkManager) readFile(ctx context.Context, filePath string) (*mmap.ReaderAt, error) {
	raw, err := vcm.vectorStorage.Read(ctx, filePath)
	if err != nil {
		return nil, err
	}

	vectors, err := vcm.deserializeVectorFile(filePath, raw)
	if err != nil {
		return nil, err
	}

	// The cached copy lives at <cache root>/<filePath>.
	cachePath := path.Join(vcm.cacheStorage.RootPath(), filePath)
	if err = vcm.cacheStorage.Write(ctx, cachePath, vectors); err != nil {
		return nil, err
	}

	reader, err := vcm.cacheStorage.Mmap(ctx, cachePath)
	if err != nil {
		return nil, err
	}

	mapped := int64(reader.Len())
	vcm.cacheSizeMutex.Lock()
	vcm.cacheSize += mapped
	vcm.cacheSizeMutex.Unlock()

	return reader, nil
}
// Read returns the pure vector data of filePath.
//
// With the cache enabled the data is copied out of the cached mmap reader;
// otherwise the file is read from the vector storage and deserialized on the
// fly.
func (vcm *VectorChunkManager) Read(ctx context.Context, filePath string) ([]byte, error) {
	if !vcm.cacheEnable {
		raw, err := vcm.vectorStorage.Read(ctx, filePath)
		if err != nil {
			return nil, err
		}
		return vcm.deserializeVectorFile(filePath, raw)
	}

	reader, err := vcm.cache.Get(filePath)
	if err != nil {
		return nil, err
	}
	// Copy the whole mapping into a buffer owned by the caller.
	buf := make([]byte, reader.Len())
	if _, err = reader.ReadAt(buf, 0); err != nil {
		return nil, err
	}
	return buf, nil
}
// MultiRead returns the pure vector data of every path in filePaths, in the
// same order. It stops at the first failing Read and returns that error with a
// nil result.
func (vcm *VectorChunkManager) MultiRead(ctx context.Context, filePaths []string) ([][]byte, error) {
	contents := make([][]byte, 0, len(filePaths))
	for _, p := range filePaths {
		data, err := vcm.Read(ctx, p)
		if err != nil {
			return nil, err
		}
		contents = append(contents, data)
	}
	return contents, nil
}
// ReadWithPrefix recursively lists the files under prefix and reads each of
// them. It returns the listed paths together with their pure vector data, in
// matching order.
func (vcm *VectorChunkManager) ReadWithPrefix(ctx context.Context, prefix string) ([]string, [][]byte, error) {
	paths, _, err := vcm.ListWithPrefix(ctx, prefix, true)
	if err != nil {
		return nil, nil, err
	}

	contents, err := vcm.MultiRead(ctx, paths)
	if err != nil {
		return nil, nil, err
	}

	return paths, contents, nil
}
// ListWithPrefix lists the files under prefix in the underlying vector
// storage and returns the paths and times that storage reports. recursive is
// forwarded unchanged.
func (vcm *VectorChunkManager) ListWithPrefix(ctx context.Context, prefix string, recursive bool) ([]string, []time.Time, error) {
	return vcm.vectorStorage.ListWithPrefix(ctx, prefix, recursive)
}
// Mmap returns the mmap reader that the cache holds for filePath.
//
// When the cache is disabled or uninitialized, or when the cache lookup
// fails, a generic "not cached" error is returned; the lookup error itself is
// not propagated.
func (vcm *VectorChunkManager) Mmap(ctx context.Context, filePath string) (*mmap.ReaderAt, error) {
	if !vcm.cacheEnable || vcm.cache == nil {
		return nil, errors.New("the file mmap has not been cached")
	}
	reader, err := vcm.cache.Get(filePath)
	if err != nil {
		return nil, errors.New("the file mmap has not been cached")
	}
	return reader, nil
}
// Reader is not implemented for VectorChunkManager; it always returns a nil
// reader and an error.
func (vcm *VectorChunkManager) Reader(ctx context.Context, filePath string) (FileReader, error) {
	return nil, errors.New("this method has not been implemented")
}
// ReadAt reads length bytes of pure vector data of filePath starting at
// offset off. If cached, it reads from local.
//
// It returns an error when length is negative, when off lies outside the
// data, and io.EOF when fewer than length bytes are available from off. With
// the cache enabled, offset validation and short reads are reported by the
// mmap reader.
func (vcm *VectorChunkManager) ReadAt(ctx context.Context, filePath string, off int64, length int64) ([]byte, error) {
	// A negative length would make the buffer allocation below panic.
	if length < 0 {
		return nil, errors.New("vectorChunkManager: invalid length")
	}

	if vcm.cacheEnable {
		r, err := vcm.cache.Get(filePath)
		if err != nil {
			return nil, err
		}
		p := make([]byte, length)
		if _, err = r.ReadAt(p, off); err != nil {
			return nil, err
		}
		return p, nil
	}

	contents, err := vcm.vectorStorage.Read(ctx, filePath)
	if err != nil {
		return nil, err
	}
	results, err := vcm.deserializeVectorFile(filePath, contents)
	if err != nil {
		return nil, err
	}

	if off < 0 || int64(len(results)) < off {
		return nil, errors.New("vectorChunkManager: invalid offset")
	}
	// Detect a short read before allocating, so an oversized length cannot
	// trigger a huge allocation. off is within [0, len(results)] here, so the
	// subtraction cannot go negative.
	if int64(len(results))-off < length {
		return nil, io.EOF
	}
	p := make([]byte, length)
	copy(p, results[off:])
	return p, nil
}
// Remove deletes filePath from the vector storage and, when the cache is
// enabled, invalidates its cache entry. The cache is left untouched if the
// storage removal fails.
func (vcm *VectorChunkManager) Remove(ctx context.Context, filePath string) error {
	if err := vcm.vectorStorage.Remove(ctx, filePath); err != nil {
		return err
	}
	if !vcm.cacheEnable {
		return nil
	}
	vcm.cache.Invalidate(filePath)
	return nil
}
// MultiRemove deletes all filePaths from the vector storage and, when the
// cache is enabled, invalidates the cache entry of each of them. The cache is
// left untouched if the storage removal fails.
func (vcm *VectorChunkManager) MultiRemove(ctx context.Context, filePaths []string) error {
	if err := vcm.vectorStorage.MultiRemove(ctx, filePaths); err != nil {
		return err
	}
	if !vcm.cacheEnable {
		return nil
	}
	for i := range filePaths {
		vcm.cache.Invalidate(filePaths[i])
	}
	return nil
}
// RemoveWithPrefix deletes every file under prefix from the vector storage
// and, when the cache is enabled, invalidates the matching cache entries.
//
// The affected paths are listed before the removal: once the files are gone
// from the vector storage the listing would no longer contain them, and their
// cache entries would never be invalidated. If the listing fails nothing is
// removed and the error is returned.
func (vcm *VectorChunkManager) RemoveWithPrefix(ctx context.Context, prefix string) error {
	var stale []string
	if vcm.cacheEnable {
		filePaths, _, err := vcm.ListWithPrefix(ctx, prefix, true)
		if err != nil {
			return err
		}
		stale = filePaths
	}

	if err := vcm.vectorStorage.RemoveWithPrefix(ctx, prefix); err != nil {
		return err
	}

	// stale is empty when the cache is disabled, so this loop is then a no-op.
	for _, p := range stale {
		vcm.cache.Invalidate(p)
	}
	return nil
}
// Close closes the cache when caching is enabled and the cache has been
// initialized; otherwise it does nothing.
func (vcm *VectorChunkManager) Close() {
	if vcm.cache == nil || !vcm.cacheEnable {
		return
	}
	vcm.cache.Close()
}