文章目录
Milvus 文档非常丰富,先使用 Python SDK 上手操作、看效果,更容易快速理解;然后再去看理论性的描述。
PyMilvus 子文档并不是按概念的逻辑顺序组织,而是按方法/属性名字排序。本文基于 v2.4.x。
官方文档地址:https://milvus.io/docs/zh
Client
from pymilvus import MilvusClient
# Authentication not enabled
client = MilvusClient("http://localhost:19530")
# Authentication enabled with the root user
client = MilvusClient(
uri="http://localhost:19530",
token="root:Milvus",
db_name="default"
)
关闭连接
client.close()
Role
client.create_role(role_name="read_only")
无返回
>>> client.list_roles()
['admin', 'public', 'read_only']
client.drop_role('read_only')
User
创建用户
client.create_user(user_name="user_1", password="psw123")
查看所有用户
>>> client.list_users()
['root', 'user_1']
# 删除用户
client.drop_user('user_1')
还可以使用 Privilege 相关方法,对用户进行授权,这里不展示。
Schema
from pymilvus import MilvusClient, DataType
# 1. Create a schema
schema = MilvusClient.create_schema(
auto_id=False,
enable_dynamic_field=False,
)
schema.add_field(field_name="my_id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="my_vector", datatype=DataType.FLOAT_VECTOR, dim=5)
from pymilvus import CollectionSchema, FieldSchema, DataType
primary_key = FieldSchema(
name="id",
dtype=DataType.INT64,
is_primary=True,
)
vector = FieldSchema(
name="vector",
dtype=DataType.FLOAT_VECTOR,
dim=768
)
schema = CollectionSchema(
fields=[primary_key, vector],
description="example_schema"
)
# Add a scalar field to the schema
schema.add_field(
field_name="scalar_01",
datatype=DataType.INT32
)
construct_from_dict & to_dict
# Create dictionary representation
schema_dict = {
"fields": [
primary_key.to_dict(),
vector.to_dict()
]
}
schema = CollectionSchema.construct_from_dict(schema_dict)
schema_dict = schema.to_dict()
Collection
client.create_collection(
collection_name="test_collection",
dimension=5
)
client.create_collection(
collection_name="quick_setup",
dimension=5,
primary_field_name="my_id",
id_type="string",
vector_field_name="my_vector",
metric_type="L2",
auto_id=True,
max_length=512
)
from pymilvus import MilvusClient, DataType
# 1. Create schema
schema = MilvusClient.create_schema(
auto_id=False,
enable_dynamic_field=False,
)
# 2. Add fields to schema
schema.add_field(field_name="my_id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="my_vector", datatype=DataType.FLOAT_VECTOR, dim=5)
# 3. Prepare index parameters
index_params = client.prepare_index_params()
# 4. Add indexes
index_params.add_index(
field_name="my_id",
index_type="STL_SORT"
)
index_params.add_index(
field_name="my_vector",
index_type="AUTOINDEX",
metric_type="L2",
params={"nlist": 1024}
)
# 5. Create a collection
client.create_collection(
collection_name="customized_setup",
schema=schema,
index_params=index_params
)
client.describe_collection(collection_name="test_collection")
{
'collection_name': 'test_collection',
'auto_id': False,
'num_shards': 1,
'description': '',
'fields': [{
'field_id': 100,
'name': 'id',
'description': '',
'type': < DataType.INT64: 5 > ,
'params': {},
'is_primary': True
}, {
'field_id': 101,
'name': 'vector',
'description': '',
'type': < DataType.FLOAT_VECTOR: 101 > ,
'params': {
'dim': 5
}
}],
'aliases': [],
'collection_id': 452739317276996078,
'consistency_level': 2,
'properties': {},
'num_partitions': 1,
'enable_dynamic_field': True
}
查看、删除与重命名
# 查看所有 collection
client.list_collections()
# 查看状态
client.get_collection_stats(collection_name="test_collection")
# 删除
client.drop_collection(collection_name="test_collection")
# 重命名
client.rename_collection(
old_name="test_collection",
new_name="test_collection_renamed"
)
Index
index_params = client.prepare_index_params()
# <pymilvus.milvus_client.index.IndexParams object at 0x127454d50>
# 4. Add indexes
# - For a scalar field
index_params.add_index(
field_name="my_id",
index_type="STL_SORT"
)
# - For a vector field
index_params.add_index(
field_name="my_vector",
index_type="IVF_FLAT",
metric_type="L2",
params={"nlist": 1024}
)
# >>> list(index_params)
#[{'field_name': 'my_id', 'index_type': 'STL_SORT', 'index_name': ''}, {'field_name': 'my_vector', 'index_type': 'IVF_FLAT', 'index_name': '', 'metric_type': 'L2', 'params': {'nlist': 1024}}]
# 5. Create a collection
client.create_collection(
collection_name="customized_setup",
schema=schema
)
# 6. Create indexes
client.create_index(
collection_name="customized_setup",
index_params=index_params,
sync=False
)
client.list_indexes(collection_name="customized_setup")
# ['my_id', 'my_vector']
# 查看描述
client.describe_index(
collection_name="customized_setup",
index_name="my_id"
)
# -> {'index_type': 'STL_SORT', 'field_name': 'my_id', 'index_name': 'my_id', 'total_rows': 0, 'indexed_rows': 0, 'pending_index_rows': 0, 'state': 'Finished'}
# 删除
client.drop_index(
collection_name="customized_setup",
index_name="my_id"
)
load & status
# 7. Load indexes
client.load_collection(
collection_name="customized_setup",
replica_number=2
)
client.refresh_load(
collection_name="test_collection"
)
client.release_collection(
collection_name="customized_setup"
)
查看状态
collection or partition
client.get_load_state(collection_name="customized_setup")
# -> {'state': <LoadState: NotLoad>}
partition
# 创建
client.create_partition(
collection_name="test_collection",
partition_name="partition_A"
)
# 加载到内存
client.load_partitions(
collection_name="test_collection",
partition_names=["partition_A"]
)
# 查看加载状态
client.get_load_state(collection_name="test_collection")
# 从内存释放
client.release_partitions(
collection_name="test_collection",
partition_names=["partition_A"]
)
# 删除
client.drop_partition(
collection_name="test_collection",
partition_name="partition_A"
)
# 查看状态
client.get_partition_stats(
collection_name="test_collection",
partition_name="_default"
)
# 判断是否存在分区
client.has_partition(
collection_name="test_collection",
partition_name="partition_A"
)
# 显示所有分区
client.list_partitions(
collection_name="test_collection",
)
Vector
# 插入数据
client.insert(
collection_name="test_collection",
data=[
{"id": 0, "vector": [0.5, 0.09, 0.2, 0.15, 0.05], "color": "green"},
{"id": 1, "vector": [0.04, 0.09, 0.33, 0.03, 0.35], "color": "blue"},
{"id": 2, "vector": [0.1, 0.21, 0.41, 0.36, 0.9], "color": "orange"},
...
{"id": 8, "vector": [0.12, 0.16, 0.25, 0.2, 0.16], "color": "pink"},
{"id": 9, "vector": [0.07, 0.38, 0.36, 0.03, 0.47], "color": "brown"}
]
)
res = client.upsert(
collection_name="test_collection",
data=[
{
'id': 1,
'vector': [
0.3457690490452393,
-0.9401784221711342,
0.9123948134344333,
0.49519396415367245,
-0.558567588166478
]
},
{
'id': 2,
'vector': [
0.42349086179692356,
-0.533609076732849,
-0.8344432775467099,
0.675761846081416,
0.57094256393761057
]
}
]
)
# 获取数据
res = client.get(
collection_name="test_collection",
# ids=1,
ids=[2, 5, 8]
)
# 删除实体
client.delete(
collection_name="test_collection",
ids=[3, 6, 7]
)
# filter 删除
client.delete(
collection_name="test_collection",
filter="id in [1, 8, 9] and color like 'b%'"
)
# query
res = client.query(
collection_name="test_collection",
# filter="",
filter="id in [6,7,8]",
limit=5,
)
# search - 根据相似度
search_params = {
"metric_type": "IP",
"params": {}
}
search_params = {
"metric_type": "IP",
"params": {
"radius": 0.1,
"range_filter": 0.8
}
}
res = client.search(
collection_name="test_collection",
data=[[0.05, 0.23, 0.07, 0.45, 0.13]],
limit=3,
filter='color like "red%"',
search_params=search_params
)
search
collection = Collection(name='{your_collection_name}') # Replace with the actual name of your collection
res = collection.hybrid_search(
reqs=[
AnnSearchRequest(
data=[['{your_text_query_vector}']], # Replace with your text vector data
anns_field='{text_vector_field_name}', # Textual data vector field
param={"metric_type": "IP", "params": {"nprobe": 10}}, # Search parameters
limit=2
),
AnnSearchRequest(
data=[['{your_image_query_vector}']], # Replace with your image vector data
anns_field='{image_vector_field_name}', # Image data vector field
param={"metric_type": "IP", "params": {"nprobe": 10}}, # Search parameters
limit=2
)
],
# Use WeightedRanker to combine results with specified weights
rerank=WeightedRanker(0.8, 0.2), # Assign weights of 0.8 to text search and 0.2 to image search
# Alternatively, use RRFRanker for reciprocal rank fusion reranking
# rerank=RRFRanker(),
limit=2
)
res = collection.search(
data=[[0.1,0.2,-0.3,-0.4,0.5]],
anns_field="vector",
param=param,
batch_size=BATCH_SIZE,
limit=LIMIT,
expr="id > 3",
output_fields=["id", "vector"]
)
for hits in res:
hits.ids
hits.distances
for hit in hits:
hit.id
hit.distance
hit.vector
hit.get("vector")
2024-09-24(二)