当前位置：首页 > news >正文

【Milvus】向量数据库pymilvus使用教程

news 2025/7/1 20:00:00

以下是根据 Milvus 官方文档整理的详细 PyMilvus 使用教程，基于 Milvus 2.5.x 版本。注意：下面的分步示例均假定已通过 connections.connect(...) 连接到 Milvus 服务（连接写法见文末“完整示例”），并且各代码片段按顺序在同一会话中执行：

PyMilvus 使用教程

from pymilvus import (
    Collection,
    CollectionSchema,
    DataType,
    FieldSchema,
)

# Define the fields: an INT64 primary key with auto_id enabled, a
# 128-dimensional float vector, and an INT32 scalar attribute.
id_field = FieldSchema(
    name="id",
    dtype=DataType.INT64,
    is_primary=True,
    auto_id=True,
)
embedding_field = FieldSchema(
    name="embedding",
    dtype=DataType.FLOAT_VECTOR,
    dim=128,
)
age_field = FieldSchema(name="age", dtype=DataType.INT32)
fields = [id_field, embedding_field, age_field]

# Create the schema from the field list.
schema = CollectionSchema(fields, description="人脸特征向量库")

# Create the collection.
# NOTE(review): no connection is opened in this snippet; it presumably
# relies on an earlier connections.connect(...) call -- confirm.
collection = Collection(name="face_db", schema=schema)

参数说明：

auto_id: 是否自动生成主键
dim: 向量维度（必须与后续插入数据维度一致）

插入数据

import random

# Generate random sample data: one 128-dimensional vector and one age value
# per entity.
num_entities = 1000
vectors = [[random.random() for _ in range(128)] for _ in range(num_entities)]
ages = [random.randint(18, 65) for _ in range(num_entities)]

# Build the insert payload, one column per field. In the collapsed original
# the `ages` entry had been swallowed by the preceding comment, so only the
# vector column was inserted.
# NOTE(review): assumes `collection` is the "face_db" collection created
# earlier in this tutorial (fields: id with auto_id, embedding, age).
data = [
    vectors,  # values for the "embedding" field
    ages,     # values for the "age" field
]

# Insert the data.
insert_result = collection.insert(data)

# Print the auto-generated primary keys.
print(insert_result.primary_keys)

创建索引

# Index configuration for the vector field: IVF_FLAT index, L2 metric,
# nlist=128.
index_params = {
    "index_type": "IVF_FLAT",
    "metric_type": "L2",
    "params": {"nlist": 128},
}

# Build the index on the "embedding" field. This call was fused onto the
# closing brace of the dict above, which is a syntax error; it is now a
# separate statement.
# NOTE(review): assumes `collection` was created by an earlier snippet.
collection.create_index(
    field_name="embedding",
    index_params=index_params,
)

常用索引类型：

FLAT: 精确搜索
IVF_FLAT: 基于聚类倒排的近似搜索，在查询速度与召回率之间取得平衡（通过 nlist / nprobe 参数调节）
HNSW: 高召回率
DISKANN: 磁盘存储优化

向量搜索

# Load the collection into memory.
# NOTE(review): assumes `collection` and the `random` module come from the
# earlier snippets of this tutorial.
collection.load()

# Prepare a random 128-dimensional query vector.
search_vector = [random.random() for _ in range(128)]

# Build the search parameters: L2 metric, nprobe=10.
search_params = {
    "metric_type": "L2",
    "params": {"nprobe": 10},
}

# Run the search against the "embedding" field, asking for 5 hits.
results = collection.search(
    data=[search_vector],
    anns_field="embedding",
    param=search_params,
    limit=5,
    output_fields=["age"],  # extra field to return with each hit
)

# Parse the results. The nested loops were collapsed onto one line in the
# original (a syntax error); they are restored as properly nested blocks.
for hits in results:
    for hit in hits:
        print(f"ID: {hit.id}, 距离: {hit.distance}, Age: {hit.entity.get('age')}")

删除操作

# Delete entities that match a filter expression.
# NOTE(review): assumes `collection` was created by an earlier snippet.
expr = "age >= 60"
collection.delete(expr)

# Drop the collection.
collection.drop()

完整示例

# Complete example: connect, create a collection, insert random vectors,
# build an index, search, and clean up.
import random  # used below to generate sample vectors; was missing originally

from pymilvus import (
    Collection,
    CollectionSchema,
    DataType,
    FieldSchema,
    connections,
)

# Connect to the Milvus service.
connections.connect(host='localhost', port='19530')

# Create the collection: an INT64 primary key with auto_id enabled plus a
# 128-dimensional float vector field.
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=128),
]
schema = CollectionSchema(fields)
collection = Collection("test_collection", schema)

# Insert data: a single column holding 1000 random 128-dimensional vectors.
data = [[[random.random() for _ in range(128)] for _ in range(1000)]]
collection.insert(data)

# Create the index on the vector field, then load the collection.
index_params = {"index_type": "IVF_FLAT", "metric_type": "L2", "params": {"nlist": 100}}
collection.create_index("vector", index_params)
collection.load()

# Search with a constant query vector, asking for 3 hits.
search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
results = collection.search(
    data=[[0.5] * 128],
    anns_field="vector",
    param=search_params,
    limit=3,
)

# Print the results. The nested loops were collapsed onto one line in the
# original (a syntax error); they are restored as properly nested blocks.
print("搜索结果:")
for hits in results:
    for hit in hits:
        print(f"ID: {hit.id}, 距离: {hit.distance}")

# Clean up.
collection.drop()