mirror of https://github.com/milvus-io/milvus.git
add Tanimoto ground truth (#1138)
* add milvus ground truth * add milvus groundtruth * [skip ci] add milvus ground truth * [skip ci]add tanimoto ground truthpull/1206/head
parent
003b0cc27a
commit
a7d57d7a82
|
@ -5,28 +5,30 @@
|
|||
|
||||
### Parameter Description:
|
||||
|
||||
| parameter | description | default setting |
|
||||
| ------------------ | --------------------------------------------- | -------------------- |
|
||||
| GET_VEC | whether to save feature vectors | False |
|
||||
| PROCESS_NUM | number of processes | 5 |
|
||||
| IP | whether metric_type is IP | True |
|
||||
| L2 | whether metric_type is L2 | False |
|
||||
| CSV | whether the query vector file format is csv | False |
|
||||
| UINT8 | whether the query vector data format is uint8 | False |
|
||||
| BASE_FOLDER_NAME | path to the source vector dataset | '/data/milvus/base' |
|
||||
| NQ_FOLDER_NAME | path to the query vector dataset | '/data/milvus/query' |
|
||||
| GT_ALL_FOLDER_NAME | intermediate filename | 'ground_truth_all' |
|
||||
| GT_FOLDER_NAME | path saved the ground truth results | 'ground_truth' |
|
||||
| LOC_FILE_NAME | file saved the gorund truth's location info | 'location.txt' |
|
||||
| FLOC_FILE_NAME | file saved the gorund truth's filenames info | 'file_location.txt' |
|
||||
| VEC_FILE_NAME | file saved the gorund truth's feature vectors | 'vectors.npy' |
|
||||
| parameter | description | default setting |
|
||||
| ------------------ | --------------------------------------------- | ----------------------- |
|
||||
| PROCESS_NUM | number of processes | 12 |
|
||||
| GET_VEC | whether to save feature vectors | False |
|
||||
| CSV | whether the query vector file format is csv | False |
|
||||
| UINT8 | whether the query vector data format is uint8 | False |
|
||||
| BASE_FOLDER_NAME | path to the source vector dataset | '/data/milvus/base' |
|
||||
| NQ_FOLDER_NAME | path to the query vector dataset | '/data/milvus/query' |
|
||||
| GT_ALL_FOLDER_NAME | intermediate filename | 'ground_truth_all' |
|
||||
| GT_FOLDER_NAME | path saved the ground truth results | 'ground_truth' |
|
||||
| LOC_FILE_NAME | file saved the gorund truth's location info | 'ground_truth.txt' |
|
||||
| FLOC_FILE_NAME | file saved the gorund truth's filenames info | 'file_ground_truth.txt' |
|
||||
| VEC_FILE_NAME | file saved the gorund truth's feature vectors | 'vectors.npy' |
|
||||
|
||||
### Usage:
|
||||
|
||||
```bash
|
||||
$ python3 milvus_ground_truth.py [-q <nq_num>] -k <topk_num> -l
|
||||
$ python3 milvus_ground_truth.py [-q <nq_num>] -k <topk_num> -m <metric typr>-l
|
||||
|
||||
# -q or --nq points the number of vectors taken from the query vector set. This parameter is optional, Without it will take all the data in the query set.
|
||||
|
||||
# -k or --topk points calculate the top k similar vectors.
|
||||
# -l means generate the ground truth results, it will save in GT_FOLDER_NAME.In this path, LOC_FILE_NAME saved the gorund truth's location info, such as "8002005210",the first ‘8’ is meaningless, the 2-4th position means the position of the result file in the folder, the 5-10th position means the position of the result vector in the result file. The result filename and vector location saved in FLOC_FILE_NAME, such as "binary_128d_00000.npy 81759", and the result vector is saved in VEC_FILE_NAME.
|
||||
|
||||
# -m or --metric points the method vector distances are compared in Milvus,such as IP/L2/Tan.
|
||||
|
||||
# -l means generate the ground truth results, it will save in GT_FOLDER_NAME.In this path, LOC_FILE_NAME saved the gorund truth's results info, such as "8002005210",the first ‘8’ is meaningless, the 2-4th position means the position of the result file in the folder, the 5-10th position means the position of the result vector in the result file. The result filename and vector location saved in FLOC_FILE_NAME, such as "binary_128d_00000.npy 81759", and the result vector is saved in VEC_FILE_NAME.
|
||||
```
|
|
@ -2,14 +2,13 @@ import getopt
|
|||
import os
|
||||
import sys
|
||||
import time
|
||||
from multiprocessing import Process
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
import numpy as np
|
||||
import math
|
||||
|
||||
PROCESS_NUM = 12
|
||||
GET_VEC = False
|
||||
PROCESS_NUM = 5
|
||||
|
||||
IP = True
|
||||
L2 = False
|
||||
CSV = False
|
||||
UINT8 = False
|
||||
|
||||
|
@ -18,31 +17,24 @@ NQ_FOLDER_NAME = '/data/milvus/query'
|
|||
|
||||
GT_ALL_FOLDER_NAME = 'ground_truth_all'
|
||||
GT_FOLDER_NAME = 'ground_truth'
|
||||
LOC_FILE_NAME = 'location.txt'
|
||||
FLOC_FILE_NAME = 'file_location.txt'
|
||||
LOC_FILE_NAME = 'ground_truth.txt'
|
||||
FLOC_FILE_NAME = 'file_ground_truth.txt'
|
||||
VEC_FILE_NAME = 'vectors.npy'
|
||||
|
||||
|
||||
# get vectors of the files
|
||||
def load_nq_vec(nq):
|
||||
vectors = []
|
||||
length = 0
|
||||
def load_query_vec(nq, vectors=[], length=0):
|
||||
filenames = os.listdir(NQ_FOLDER_NAME)
|
||||
filenames.sort()
|
||||
if nq == 0:
|
||||
for filename in filenames:
|
||||
vectors += load_vec_list(NQ_FOLDER_NAME + '/' + filename)
|
||||
return vectors
|
||||
for filename in filenames:
|
||||
vec_list = load_vec_list(NQ_FOLDER_NAME + '/' + filename)
|
||||
length += len(vec_list)
|
||||
if length > nq:
|
||||
if nq!=0 and length>nq :
|
||||
num = nq % len(vec_list)
|
||||
# vec_list = load_vec_list(NQ_FOLDER_NAME + '/' + filename, num)
|
||||
vec_list = vec_list[0:num]
|
||||
vectors += vec_list[0:num]
|
||||
break
|
||||
vectors += vec_list
|
||||
if len(vectors) == nq:
|
||||
return vectors
|
||||
return vectors
|
||||
|
||||
|
||||
# load vectors from filr_name and num means nq's number
|
||||
|
@ -55,12 +47,20 @@ def load_vec_list(file_name):
|
|||
data = np.load(file_name)
|
||||
if UINT8:
|
||||
data = (data + 0.5) / 255
|
||||
vec_list = []
|
||||
for i in range(len(data)):
|
||||
vec_list.append(data[i].tolist())
|
||||
vec_list = data.tolist()
|
||||
return vec_list
|
||||
|
||||
|
||||
def hex_to_bin(fp):
|
||||
vec=[]
|
||||
length = len(fp) * 4
|
||||
bstr = str(bin(int(fp,16)))
|
||||
bstr = (length-(len(bstr)-2)) * '0' + bstr[2:]
|
||||
for f in bstr:
|
||||
vec.append(int(f))
|
||||
return vec
|
||||
|
||||
|
||||
def calEuclideanDistance(vec1, vec2):
|
||||
vec1 = np.array(vec1)
|
||||
vec2 = np.array(vec2)
|
||||
|
@ -75,6 +75,18 @@ def calInnerDistance(vec1, vec2):
|
|||
return dist
|
||||
|
||||
|
||||
def calTanimoto(vec1, vec2):
|
||||
vec1 = hex_to_bin(vec1)
|
||||
vec2 = hex_to_bin(vec2)
|
||||
# print(vec1,vec2)
|
||||
nc = float(np.inner(vec1, vec2))
|
||||
n1 = float(np.sum(vec1))
|
||||
n2 = float(np.sum(vec2))
|
||||
dist = nc/(n1+n2-nc)
|
||||
print(nc,n1,n2)
|
||||
return dist
|
||||
|
||||
|
||||
def get_ground_truth_l2(topk, idx, vct_nq):
|
||||
filenames = os.listdir(BASE_FOLDER_NAME)
|
||||
filenames.sort()
|
||||
|
@ -85,7 +97,7 @@ def get_ground_truth_l2(topk, idx, vct_nq):
|
|||
for j in range(len(vec_list)):
|
||||
dist = calEuclideanDistance(vct_nq, vec_list[j])
|
||||
num_j = "%01d%03d%06d" % (8, k, j)
|
||||
if j < topk and k == 0:
|
||||
if k==0 and j<topk :
|
||||
no_dist[num_j] = dist
|
||||
else:
|
||||
# sorted by values
|
||||
|
@ -94,8 +106,9 @@ def get_ground_truth_l2(topk, idx, vct_nq):
|
|||
if dist < max_value:
|
||||
m = no_dist.pop(max_key)
|
||||
no_dist[num_j] = dist
|
||||
k = k + 1
|
||||
k += 1
|
||||
no_dist = sorted(no_dist.items(), key=lambda x: x[1])
|
||||
print(no_dist)
|
||||
save_gt_file(no_dist, idx)
|
||||
|
||||
|
||||
|
@ -108,8 +121,8 @@ def get_ground_truth_ip(topk, idx, vct_nq):
|
|||
vec_list = load_vec_list(BASE_FOLDER_NAME + '/' + filename)
|
||||
for j in range(len(vec_list)):
|
||||
dist = calInnerDistance(vct_nq, vec_list[j])
|
||||
num_j = "%01d%03d%06d" % (8, k, j)
|
||||
if j < topk and k == 0:
|
||||
num_j = "%03d%06d" % (k, j)
|
||||
if k==0 and j<topk :
|
||||
no_dist[num_j] = dist
|
||||
else:
|
||||
min_key = min(no_dist, key=no_dist.get)
|
||||
|
@ -117,55 +130,70 @@ def get_ground_truth_ip(topk, idx, vct_nq):
|
|||
if dist > min_value:
|
||||
m = no_dist.pop(min_key)
|
||||
no_dist[num_j] = dist
|
||||
k = k + 1
|
||||
k += 1
|
||||
no_dist = sorted(no_dist.items(), key=lambda x: x[1], reverse=True)
|
||||
print(no_dist)
|
||||
save_gt_file(no_dist, idx)
|
||||
|
||||
|
||||
def get_ground_truth_tanimoto(topk, idx, vec_nq):
|
||||
filenames = os.listdir(BASE_FOLDER_NAME) # get the whole file names
|
||||
filenames.sort()
|
||||
no_dist = {}
|
||||
k = 0
|
||||
for filename in filenames:
|
||||
vec_list = load_vec_list(BASE_FOLDER_NAME + '/' + filename)
|
||||
print(BASE_FOLDER_NAME + '/' + filename, len(vec_list))
|
||||
for j in range(len(vec_list)):
|
||||
dist = calTanimoto(vec_nq, vec_list[j])
|
||||
num_j = "%03d%06d" % (k, j)
|
||||
if k==0 and j<topk :
|
||||
no_dist[num_j] = dist
|
||||
else:
|
||||
min_key = min(no_dist, key=no_dist.get)
|
||||
min_value = no_dist[min_key]
|
||||
if dist > min_value:
|
||||
m = no_dist.pop(min_key)
|
||||
no_dist[num_j] = dist
|
||||
k += 1
|
||||
no_dist = sorted(no_dist.items(), key=lambda x: x[1], reverse=True)
|
||||
print(no_dist)
|
||||
save_gt_file(no_dist, idx)
|
||||
|
||||
|
||||
def save_gt_file(no_dist, idx):
|
||||
s = "%05d" % idx
|
||||
idx_fname = GT_ALL_FOLDER_NAME + '/' + s + '_idx.txt'
|
||||
dis_fname = GT_ALL_FOLDER_NAME + '/' + s + '_dis.txt'
|
||||
with open(idx_fname, 'w') as f:
|
||||
filename = "%05d" % idx + 'results.txt'
|
||||
with open(GT_ALL_FOLDER_NAME+'/'+filename, 'w') as f:
|
||||
for re in no_dist:
|
||||
f.write(str(re[0]) + '\n')
|
||||
f.write('\n')
|
||||
with open(dis_fname, 'w') as f:
|
||||
for re in no_dist:
|
||||
f.write(str(re[1]) + '\n')
|
||||
f.write('\n')
|
||||
|
||||
f.write(str(re[0]) + ' ' + str(re[1]) + '\n')
|
||||
|
||||
def get_loc_txt(file):
|
||||
filenames = os.listdir(GT_ALL_FOLDER_NAME)
|
||||
filenames.sort()
|
||||
write_file = open(GT_FOLDER_NAME + '/' + file, 'w+')
|
||||
for f in filenames:
|
||||
if f.endswith('_idx.txt'):
|
||||
f = GT_ALL_FOLDER_NAME + '/' + f
|
||||
for line in open(f, 'r'):
|
||||
write_file.write(line)
|
||||
for line in open(GT_ALL_FOLDER_NAME+'/'+f, 'r'):
|
||||
write_file.write(line)
|
||||
write_file.write('\n')
|
||||
|
||||
|
||||
def get_file_loc_txt(gt_file, fnames_file):
|
||||
gt_file = GT_FOLDER_NAME + '/' + gt_file
|
||||
fnames_file = GT_FOLDER_NAME + '/' + fnames_file
|
||||
filenames = os.listdir(BASE_FOLDER_NAME)
|
||||
filenames.sort()
|
||||
with open(gt_file, 'r') as gt_f:
|
||||
with open(fnames_file, 'w') as fnames_f:
|
||||
with open(GT_FOLDER_NAME+'/'+gt_file, 'r') as gt_f:
|
||||
with open(GT_FOLDER_NAME+'/'+fnames_file, 'w') as fnames_f:
|
||||
for line in gt_f:
|
||||
if line != '\n':
|
||||
line = line.strip()
|
||||
line = line.split()[0]
|
||||
loca = int(line[1:4])
|
||||
offset = int(line[4:10])
|
||||
fnames_f.write(filenames[loca] + ' ' + str(offset + 1) + '\n')
|
||||
else:
|
||||
fnames_f.write('\n')
|
||||
fnames_f.write(line)
|
||||
|
||||
|
||||
def load_gt_file_out():
|
||||
file_name = GT_FOLDER_NAME + '/' +FLOC_FILE_NAME
|
||||
file_name = GT_FOLDER_NAME + '/' + FLOC_FILE_NAME
|
||||
base_filename = []
|
||||
num = []
|
||||
with open(file_name, 'r') as f:
|
||||
|
@ -177,70 +205,36 @@ def load_gt_file_out():
|
|||
return base_filename, num
|
||||
|
||||
|
||||
def ground_truth_process(nq, topk):
|
||||
try:
|
||||
os.mkdir(GT_ALL_FOLDER_NAME)
|
||||
except:
|
||||
print('there already exits folder named ' + GT_ALL_FOLDER_NAME + ', please delete it first.')
|
||||
else:
|
||||
vectors = load_nq_vec(nq)
|
||||
print("query list:", len(vectors))
|
||||
processes = []
|
||||
process_num = PROCESS_NUM
|
||||
nq = len(vectors)
|
||||
loops = nq // process_num
|
||||
rest = nq % process_num
|
||||
if rest != 0:
|
||||
loops += 1
|
||||
time_start = time.time()
|
||||
for loop in range(loops):
|
||||
time1_start = time.time()
|
||||
base = loop * process_num
|
||||
if rest != 0 and loop == loops - 1:
|
||||
process_num = rest
|
||||
print('base:', loop)
|
||||
for i in range(process_num):
|
||||
print('nq_index:', base + i)
|
||||
if L2:
|
||||
if base + i == 0:
|
||||
print("get ground truth by L2.")
|
||||
process = Process(target=get_ground_truth_l2,
|
||||
args=(topk, base + i, vectors[base + i]))
|
||||
elif IP:
|
||||
if base + i == 0:
|
||||
print("get ground truth by IP.")
|
||||
process = Process(target=get_ground_truth_ip,
|
||||
args=(topk, base + i, vectors[base + i]))
|
||||
processes.append(process)
|
||||
process.start()
|
||||
for p in processes:
|
||||
p.join()
|
||||
time1_end = time.time()
|
||||
print("base", loop, "time_cost = ", round(time1_end - time1_start, 4))
|
||||
if not os.path.exists(GT_FOLDER_NAME):
|
||||
os.mkdir(GT_FOLDER_NAME)
|
||||
get_loc_txt(LOC_FILE_NAME)
|
||||
get_file_loc_txt(LOC_FILE_NAME, FLOC_FILE_NAME)
|
||||
if GET_VEC:
|
||||
vec = []
|
||||
file, num = load_gt_file_out()
|
||||
for i in range(len(file)):
|
||||
n = int(num[i]) - 1
|
||||
vectors = load_vec_list(BASE_FOLDER_NAME + '/' + file[i])
|
||||
vec.append(vectors[n])
|
||||
print("saved len of vec:", len(vec))
|
||||
np.save(GT_FOLDER_NAME + '/' + VEC_FILE_NAME, vec)
|
||||
time_end = time.time()
|
||||
time_cost = time_end - time_start
|
||||
print("total_time = ", round(time_cost, 4), "\nGet the ground truth successfully!")
|
||||
def ground_truth_process(metric,nq_list, topk, num):
|
||||
thread_num = len(nq_list)
|
||||
with ProcessPoolExecutor(thread_num) as executor:
|
||||
for i in range(thread_num):
|
||||
# print("Process:",num+i)
|
||||
if metric == 'L2':
|
||||
executor.submit(get_ground_truth_l2, topk, num+i, nq_list[i])
|
||||
elif metric == 'IP':
|
||||
executor.submit(get_ground_truth_ip, topk, num+i, nq_list[i])
|
||||
elif metric == 'Tan':
|
||||
executor.submit(get_ground_truth_tanimoto, topk, num+i, nq_list[i])
|
||||
get_loc_txt(LOC_FILE_NAME)
|
||||
get_file_loc_txt(LOC_FILE_NAME, FLOC_FILE_NAME)
|
||||
if GET_VEC:
|
||||
vec = []
|
||||
file, num = load_gt_file_out()
|
||||
for i in range(len(file)):
|
||||
n = int(num[i]) - 1
|
||||
vectors = load_vec_list(BASE_FOLDER_NAME + '/' + file[i])
|
||||
vec.append(vectors[n])
|
||||
print("saved len of vec:", len(vec))
|
||||
np.save(GT_FOLDER_NAME + '/' + VEC_FILE_NAME, vec)
|
||||
|
||||
|
||||
def main():
|
||||
try:
|
||||
opts, args = getopt.getopt(
|
||||
sys.argv[1:],
|
||||
"hlq:k:",
|
||||
["help", "nq=", "topk="],
|
||||
"hlq:k:m:",
|
||||
["help", "nq=", "topk=", "metric="],
|
||||
)
|
||||
except getopt.GetoptError:
|
||||
print("Usage: test.py [-q <nq>] -k <topk> -s")
|
||||
|
@ -254,9 +248,33 @@ def main():
|
|||
nq = int(opt_value)
|
||||
elif opt_name in ("-k", "--topk"):
|
||||
topk = int(opt_value)
|
||||
elif opt_name == "-l":
|
||||
ground_truth_process(nq, topk) # test.py [-q <nq>] -k <topk> -l
|
||||
sys.exit()
|
||||
elif opt_name in ("-m", "--metric"):
|
||||
metric = opt_value
|
||||
elif opt_name == "-l": # test.py [-q <nq>] -k <topk> -m -l
|
||||
try:
|
||||
os.mkdir(GT_ALL_FOLDER_NAME)
|
||||
except:
|
||||
print('there already exits folder named ' + GT_ALL_FOLDER_NAME + ', please delete it first.')
|
||||
sys.exit()
|
||||
if not os.path.exists(GT_FOLDER_NAME):
|
||||
os.mkdir(GT_FOLDER_NAME)
|
||||
|
||||
print("metric type is",metric)
|
||||
time_start = time.time()
|
||||
query_vectors = load_query_vec(nq)
|
||||
nq = len(query_vectors)
|
||||
print("query list:", len(query_vectors))
|
||||
num = math.ceil(nq/PROCESS_NUM)
|
||||
for i in range(num):
|
||||
print("start with round:",i+1)
|
||||
if i==num-1:
|
||||
ground_truth_process(metric, query_vectors[i*PROCESS_NUM:nq], topk, i*PROCESS_NUM)
|
||||
else:
|
||||
ground_truth_process(metric, query_vectors[i*PROCESS_NUM:i*PROCESS_NUM+PROCESS_NUM], topk, i*PROCESS_NUM)
|
||||
|
||||
time_end = time.time()
|
||||
time_cost = time_end - time_start
|
||||
print("total_time = ", round(time_cost, 4), "\nGet the ground truth successfully!")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
Loading…
Reference in New Issue