
Commit 143b1e5

Commit message: faiss
1 parent b553de9 commit 143b1e5

9 files changed: +312 −98 lines

Faiss的使用/001-使用欧式距离和内积的方式检索.py

-57 This file was deleted (replaced by 001-欧式距离检索.py below).
+67
@@ -0,0 +1,67 @@
"""
@file   : 001-欧式距离检索.py
@author : xiaolu
@email  : luxiaonlp@163.com
@time   : 2021-08-09
"""
import faiss
import numpy as np


if __name__ == '__main__':
    n_data, d = 1000, 512  # number of vectors in the search database and the dimension of each vector
    np.random.seed(43)     # fix the random seed so repeated runs give identical results

    # Build the database to search against
    data = []
    mu, sigma = 3, 0.1  # draw vectors from a Gaussian with this mean and standard deviation
    for i in range(n_data):
        data.append(np.random.normal(mu, sigma, d))
    data = np.array(data).astype('float32')  # Faiss only accepts 32-bit floats

    # Generate the query vectors
    query = []
    n_query = 10  # generate 10 query vectors
    mu, sigma = 3, 0.1
    np.random.seed(12)
    for i in range(n_query):
        query.append(np.random.normal(mu, sigma, d))
    query = np.array(query).astype('float32')

    # Build the index; remember to pass the vector dimension d
    index = faiss.IndexFlatL2(d)
    # print(index.is_trained)  # if this is False the index needs training; covered in later examples

    # Add the data
    index.add(data)
    # print(index.ntotal)  # total number of indexed vectors

    # Run the search
    k = 10  # return the 10 nearest neighbours

    # Query with the first five database vectors so the result is easy to verify:
    # a vector's distance to itself is 0, so its top match must be itself
    query_self = data[:5]

    dis, ind = index.search(query_self, k=k)
    print(dis)  # each row holds the distances from one query to its 10 nearest vectors
    print(ind)  # each row holds the indices of those 10 nearest vectors
    """
    [[0.        8.55197   8.634906  8.683499  8.698736  8.821949  8.902446
      8.943979  8.9516735 8.972908 ]
     [0.        8.369204  8.482748  8.53028   8.581224  8.680499  8.684254
      8.697291  8.719812  8.753435 ]
     [0.        8.209936  8.392483  8.456179  8.473589  8.480727  8.551348
      8.553277  8.576391  8.592704 ]
     [0.        8.473689  8.621014  8.827385  8.883725  8.980131  8.99064
      9.015673  9.017438  9.027972 ]
     [0.        8.268832  8.349455  8.597895  8.611757  8.658188  8.675722
      8.685029  8.70588   8.707612 ]]
    [[  0 877 502  42 606 366 348 923 563  56]
     [  1 849 974 106 348 364 877 242 280 173]
     [  2 877 127 655 253 233 558 678  13 208]
     [  3 421  94 348 502 402 536 646 563 735]
     [  4 986 230 209 446 889 974 241 550 248]]
    """
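The original file name mentions both Euclidean-distance and inner-product retrieval, but the script above only demonstrates IndexFlatL2. A minimal sketch of the inner-product variant (not part of the commit), assuming the same random data setup; the cosine-similarity part relies on faiss.normalize_L2:

import faiss
import numpy as np

np.random.seed(43)
data = np.random.normal(3, 0.1, (1000, 512)).astype('float32')

index_ip = faiss.IndexFlatIP(512)   # exact inner-product search, no training required
index_ip.add(data)
scores, ids = index_ip.search(data[:5], 10)
print(scores)  # rows are sorted by descending inner product (larger = more similar)
print(ids)     # with raw inner product a vector's own id is not guaranteed to rank first

# For cosine similarity, L2-normalize the vectors first, then use IndexFlatIP
normed = data.copy()
faiss.normalize_L2(normed)          # in-place row-wise normalization
index_cos = faiss.IndexFlatIP(512)
index_cos.add(normed)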

Faiss的使用/002-IndexIVFFlat检索.py

-41 This file was deleted (replaced by 002-倒排表快速索引.py below).
+42
@@ -0,0 +1,42 @@
"""
@file   : 002-倒排表快速索引.py
@author : xiaolu
@email  : luxiaonlp@163.com
@time   : 2021-08-09
"""
import numpy as np
import faiss

if __name__ == '__main__':
    n_data, d = 1000, 512  # number of vectors in the search database and the dimension of each vector
    np.random.seed(43)     # fix the random seed so repeated runs give identical results

    # Build the database to search against
    data = []
    mu, sigma = 3, 0.1  # draw vectors from a Gaussian with this mean and standard deviation
    for i in range(n_data):
        data.append(np.random.normal(mu, sigma, d))
    data = np.array(data).astype('float32')  # Faiss only accepts 32-bit floats

    # Generate the query vectors
    query = []
    n_query = 10  # generate 10 query vectors
    mu, sigma = 3, 0.1
    np.random.seed(12)
    for i in range(n_query):
        query.append(np.random.normal(mu, sigma, d))
    query = np.array(query).astype('float32')

    nlist = 50  # number of Voronoi cells the database vectors are partitioned into
    k = 10
    quantizer = faiss.IndexFlatL2(d)  # the coarse quantizer
    index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)  # METRIC_L2 for L2 distance, or faiss.METRIC_INNER_PRODUCT for inner product
    assert not index.is_trained  # an inverted-file index must be trained before adding data
    index.train(data)  # the training set should follow the same distribution as the database
    assert index.is_trained

    index.add(data)
    index.nprobe = 2  # number of Voronoi cells to visit during search
    dis, ind = index.search(query, k)
    print(dis)
    print(ind)
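nprobe is the main speed/accuracy knob of IndexIVFFlat: with nprobe = 2 only 2 of the 50 cells are scanned, which is fast but can miss true neighbours. A small sketch (not in the commit) that compares the IVF results against an exhaustive IndexFlatL2 baseline for a few nprobe settings; the "match" metric below is a simple position-wise agreement rate chosen purely for illustration:

import faiss
import numpy as np

np.random.seed(43)
data = np.random.normal(3, 0.1, (1000, 512)).astype('float32')
np.random.seed(12)
query = np.random.normal(3, 0.1, (10, 512)).astype('float32')

exact = faiss.IndexFlatL2(512)   # exhaustive baseline
exact.add(data)
_, gt = exact.search(query, 10)  # "ground-truth" neighbour ids

quantizer = faiss.IndexFlatL2(512)
ivf = faiss.IndexIVFFlat(quantizer, 512, 50, faiss.METRIC_L2)
ivf.train(data)
ivf.add(data)

for nprobe in (1, 2, 8, 50):
    ivf.nprobe = nprobe              # number of cells scanned per query
    _, ids = ivf.search(query, 10)
    match = (ids == gt).mean()       # position-wise agreement with the exact result
    print(f'nprobe={nprobe:2d}  match={match:.2f}')

With nprobe = 50 every cell is scanned, so the result coincides with the exhaustive search.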
+45
@@ -0,0 +1,45 @@
"""
@file   : 003-乘积量化索引.py
@author : xiaolu
@email  : luxiaonlp@163.com
@time   : 2021-08-09
"""
import numpy as np
import faiss

if __name__ == '__main__':
    n_data, d = 1000, 512  # number of vectors in the search database and the dimension of each vector
    np.random.seed(43)     # fix the random seed so repeated runs give identical results

    # Build the database to search against
    data = []
    mu, sigma = 3, 0.1  # draw vectors from a Gaussian with this mean and standard deviation
    for i in range(n_data):
        data.append(np.random.normal(mu, sigma, d))
    data = np.array(data).astype('float32')  # Faiss only accepts 32-bit floats

    # Generate the query vectors
    query = []
    n_query = 10  # generate 10 query vectors
    mu, sigma = 3, 0.1
    np.random.seed(12)
    for i in range(n_query):
        query.append(np.random.normal(mu, sigma, d))
    query = np.array(query).astype('float32')

    nlist = 50
    m = 8  # number of sub-vectors each vector is split into; must divide d evenly
    k = 10
    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 4)  # the 4 means each sub-vector is encoded with 4 bits

    index.train(data)
    index.add(data)
    index.nprobe = 50
    dis, ind = index.search(data[:10], k)  # query with the database's own vectors
    print(dis)
    print(ind)

    dis, ind = index.search(query, k)  # the actual queries
    print(dis)
    print(ind)
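With m = 8 sub-vectors at 4 bits each, every stored vector is compressed to 8 × 4 = 32 bits (4 bytes) plus its inverted-list id, which is why the self-query above does not return an exact distance of 0. A hypothetical follow-up sketch (not part of the commit, the file name 'ivfpq.index' is made up) showing how a trained IndexIVFPQ can be saved and reloaded with faiss.write_index / faiss.read_index:

import faiss
import numpy as np

np.random.seed(43)
data = np.random.normal(3, 0.1, (1000, 512)).astype('float32')

quantizer = faiss.IndexFlatL2(512)
index = faiss.IndexIVFPQ(quantizer, 512, 50, 8, 4)  # 8 sub-vectors x 4 bits = 4 bytes per stored code
index.train(data)
index.add(data)
index.nprobe = 50

faiss.write_index(index, 'ivfpq.index')   # persist the trained index to disk
index2 = faiss.read_index('ivfpq.index')  # reload it later without retraining
dis, ind = index2.search(data[:5], 10)
print(dis[:, 0])  # small but usually not exactly 0: PQ reconstruction is lossy
print(ind[:, 0])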
@@ -0,0 +1,58 @@
"""
@file   : 004-faiss实现kmeans聚类.py
@author : xiaolu
@email  : luxiaonlp@163.com
@time   : 2021-08-09
"""
import faiss
import numpy as np


if __name__ == '__main__':
    # Data
    n_data, d = 2000, 512
    np.random.seed(43)
    data = []
    mu, sigma = 3, 0.1
    for i in range(n_data):
        data.append(np.random.normal(mu, sigma, d))
    data = np.array(data).astype('float32')

    # Clustering
    n_centroids = 1024  # number of cluster centroids
    d = data.shape[1]
    kmeans = faiss.Kmeans(d, n_centroids)
    kmeans.train(data)
    # Print the centroids
    # print(kmeans.centroids)
    # print(len(kmeans.centroids))

    # Check which clusters the first five vectors belong to (the two most likely clusters for each)
    D, I = kmeans.index.search(data[:5], k=2)
    print(D)  # distance to each of the two nearest centroids
    print(I)  # ids of those centroids
    """
    Output:
    [[4.1553707 5.2924204]
     [1.9329664 4.930997 ]
     [4.537619  4.8509283]
     [4.6700296 5.2252126]
     [2.101182  4.9292693]]
    [[478 568]
     [767 697]
     [568 527]
     [999 568]
     [175 853]]
    """

    print('*' * 100)
    # Find the vectors closest to each centroid
    k = 5
    index = faiss.IndexFlatL2(d)
    index.add(data)
    D, I = index.search(kmeans.centroids, k)
    print(D)
    print(I)
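faiss.Kmeans also accepts niter and verbose arguments, and kmeans.index can assign every vector to its nearest centroid in one call. A small sketch under those assumptions (the centroid count is lowered to 64 here purely for illustration, since 1024 centroids for 2000 points leaves many clusters almost empty):

import faiss
import numpy as np

np.random.seed(43)
data = np.random.normal(3, 0.1, (2000, 512)).astype('float32')

kmeans = faiss.Kmeans(512, 64, niter=20, verbose=True)  # 64 centroids, 20 k-means iterations
kmeans.train(data)

_, assignments = kmeans.index.search(data, 1)  # nearest centroid for every vector
print(assignments.ravel()[:20])                # hard cluster assignment of the first 20 vectors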
+27
@@ -0,0 +1,27 @@
"""
@file   : 005-faiss实现pca降维.py
@author : xiaolu
@email  : luxiaonlp@163.com
@time   : 2021-08-09
"""
import faiss
import numpy as np


if __name__ == '__main__':
    # Data
    n_data, d = 2000, 512
    np.random.seed(43)
    data = []
    mu, sigma = 3, 0.1
    for i in range(n_data):
        data.append(np.random.normal(mu, sigma, d))
    data = np.array(data).astype('float32')

    mat = faiss.PCAMatrix(512, 64)  # reduce from 512 dimensions to 64
    mat.train(data)
    assert mat.is_trained
    tr = mat.apply_py(data)
    print(tr.shape)
    print(tr)
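A PCAMatrix can also be chained in front of an index with faiss.IndexPreTransform, so the projection is applied automatically on every add and search; a minimal sketch of that pattern (not part of the commit):

import faiss
import numpy as np

np.random.seed(43)
data = np.random.normal(3, 0.1, (2000, 512)).astype('float32')

mat = faiss.PCAMatrix(512, 64)                   # 512 -> 64 projection
sub_index = faiss.IndexFlatL2(64)                # the search happens in the 64-d space
index = faiss.IndexPreTransform(mat, sub_index)  # applies the PCA before add/search
index.train(data)                                # trains the PCA matrix
index.add(data)
dis, ind = index.search(data[:5], 10)
print(ind)  # each vector should still retrieve itself first in the reduced space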
@@ -0,0 +1,37 @@
"""
@file   : 006-faiss实现PQ编码和解码.py
@author : xiaolu
@email  : luxiaonlp@163.com
@time   : 2021-08-09
"""
import faiss
import numpy as np


if __name__ == '__main__':
    # Data
    n_data, d = 2000, 512
    np.random.seed(43)
    data = []
    mu, sigma = 3, 0.1
    for i in range(n_data):
        data.append(np.random.normal(mu, sigma, d))
    data = np.array(data).astype('float32')

    cs = 4  # code size (bytes)
    # Training data
    x = data  # the original dataset

    x_train = data  # the training set
    pq = faiss.ProductQuantizer(d, cs, 8)
    pq.train(x_train)

    # Encode
    codes = pq.compute_codes(x)

    # Decode
    x2 = pq.decode(codes)

    # Relative error between the original data and its encode/decode reconstruction
    avg_relative_error = ((x - x2) ** 2).sum() / (x ** 2).sum()
    print(avg_relative_error)
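For reference, ProductQuantizer(d, M, nbits) takes the number of sub-quantizers M and the bits per sub-quantizer; with M = 4 and nbits = 8 each vector is stored as 4 uint8 codes, i.e. 4 bytes versus the original 512 × 4 bytes. A self-contained sketch (mirroring the script above, not part of the commit) that makes the compression ratio explicit:

import faiss
import numpy as np

np.random.seed(43)
data = np.random.normal(3, 0.1, (2000, 512)).astype('float32')

pq = faiss.ProductQuantizer(512, 4, 8)  # d=512, M=4 sub-quantizers, 8 bits each
pq.train(data)
codes = pq.compute_codes(data)
print(codes.shape, codes.dtype)  # (2000, 4) uint8: 4 bytes per vector vs 2048 bytes raw
recon = pq.decode(codes)
rel_err = ((data - recon) ** 2).sum() / (data ** 2).sum()
print(rel_err)                   # relative reconstruction error of the ~512x compression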
