PyMilvus - 基本使用

最新推荐文章于 2025-04-28 18:16:56 发布

富婆E

最新推荐文章于 2025-04-28 18:16:56 发布

阅读量1.1k

点赞数 14

分类专栏： # AI 极简实践

文章标签： PyMilvus Milvus python connection Schema index 向量数据库

本文链接：https://blog.csdn.net/lovechris00/article/details/142481277

版权

AI 极简实践专栏收录该内容

22 篇文章

订阅专栏

文章目录

Milvus 文档非常丰富，先使用 Python SDK 上手操作、看效果，更容易快速理解；然后可以再看理论性的描述。

PyMilvus 子文档并不是按概念的逻辑顺序组织，而是按方法/属性名字排序。本文基于 v2.4.x。

官方文档地址：https://milvus.io/docs/zh

Client

from pymilvus import MilvusClient

# Authentication not enabled: connect to a local Milvus instance.
client = MilvusClient("http://localhost:19530")

# Authentication enabled: log in as the root user.
# The token is given in "<user>:<password>" form.
client = MilvusClient(
    uri="http://localhost:19530",
    token="root:Milvus",
    db_name="default"
)

关闭连接

client.close()

Role

client.create_role(role_name="read_only")

无返回

>>> client.list_roles()
['admin', 'public', 'read_only']

client.drop_role('read_only')

User

创建用户


client.create_user(user_name="user_1", password="psw123")

查看所有用户

>>> client.list_users()
['root', 'user_1']

# Delete the user
client.drop_user('user_1')

还可以使用 Privilege 相关方法，对用户进行授权，这里不展示。

Schema

from pymilvus import MilvusClient, DataType

# 1. Create a schema with auto_id and dynamic fields both disabled
schema = MilvusClient.create_schema(
    auto_id=False,
    enable_dynamic_field=False,
)

# "my_id" is the INT64 primary key; "my_vector" is a 5-dimensional float vector.
schema.add_field(field_name="my_id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="my_vector", datatype=DataType.FLOAT_VECTOR, dim=5)

from pymilvus import CollectionSchema, FieldSchema, DataType

# Primary-key field: INT64 column named "id".
primary_key = FieldSchema(
    name="id",
    dtype=DataType.INT64,
    is_primary=True,
)

# Vector field: 768-dimensional float vector named "vector".
vector = FieldSchema(
    name="vector",
    dtype=DataType.FLOAT_VECTOR,
    dim=768
)

# Build the schema from the two field definitions above.
schema = CollectionSchema(
    fields=[primary_key, vector],
    description="example_schema"
)

# Add a scalar field to the schema
schema.add_field(
    field_name="scalar_01",
    datatype=DataType.INT32
)

construct_from_dict & to_dict

# Create dictionary representation
# (`primary_key` and `vector` are the FieldSchema objects defined in the
# CollectionSchema example earlier in this article.)
schema_dict = {
    "fields": [     
        primary_key.to_dict(),
        vector.to_dict()                
    ]
}  

# Build a CollectionSchema from the dictionary ...
schema = CollectionSchema.construct_from_dict(schema_dict) 

# ... and convert the schema back into a dictionary.
schema_dict = schema.to_dict()

Collection

https://milvus.io/api-reference/pymilvus/v2.4.x/MilvusClient/Collections/create_collection.md

# Quick setup: only the collection name and the vector dimension are given.
client.create_collection(
    collection_name="test_collection", 
    dimension=5
)

# Quick setup with explicit field names, a string-typed primary key
# (auto_id enabled, max_length 512) and the L2 metric.
client.create_collection(
    collection_name="quick_setup",
    dimension=5,
    primary_field_name="my_id",
    id_type="string",
    vector_field_name="my_vector",
    metric_type="L2",
    auto_id=True,
    max_length=512
)

from pymilvus import MilvusClient, DataType

# 1. Create schema
schema = MilvusClient.create_schema(
    auto_id=False,
    enable_dynamic_field=False,
)

# 2. Add fields to schema
schema.add_field(field_name="my_id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="my_vector", datatype=DataType.FLOAT_VECTOR, dim=5)

# 3. Prepare index parameters
# (`client` is the MilvusClient instance created in the Client section.)
index_params = client.prepare_index_params()

# 4. Add indexes
# - scalar index on the primary key
index_params.add_index(
    field_name="my_id",
    index_type="STL_SORT"
)

# - vector index on "my_vector"
# NOTE(review): `nlist` looks like an IVF-style parameter — confirm it is
# meaningful together with index_type="AUTOINDEX".
index_params.add_index(
    field_name="my_vector", 
    index_type="AUTOINDEX",
    metric_type="L2",
    params={"nlist": 1024}
)

# 5. Create a collection, passing the schema and the index parameters together
client.create_collection(
    collection_name="customized_setup",
    schema=schema,
    index_params=index_params
)

# Describe a collection; the sample output below is for "test_collection".
client.describe_collection(collection_name="test_collection")

{
	'collection_name': 'test_collection',
	'auto_id': False,
	'num_shards': 1,
	'description': '',
	'fields': [{
      'field_id': 100,
      'name': 'id',
      'description': '',
      'type': < DataType.INT64: 5 > ,
      'params': {},
      'is_primary': True
    }, {
      'field_id': 101,
      'name': 'vector',
      'description': '',
      'type': < DataType.FLOAT_VECTOR: 101 > ,
      'params': {
        'dim': 5
      }
    }],
	'aliases': [],
	'collection_id': 452739317276996078,
	'consistency_level': 2,
	'properties': {},
	'num_partitions': 1,
	'enable_dynamic_field': True
}

查看、删除与重命名

# List all collections
client.list_collections()  

# Get collection statistics
client.get_collection_stats(collection_name="test_collection")

# Drop the collection
client.drop_collection(collection_name="test_collection")

# Rename a collection
# NOTE(review): if these snippets are run in order, "test_collection" was just
# dropped above — presumably each call is meant as an independent example.
client.rename_collection(
    old_name="test_collection",
    new_name="test_collection_renamed"
)

Index


# Create an empty index-parameter container.
index_params = client.prepare_index_params()
# -> <pymilvus.milvus_client.index.IndexParams object at 0x127454d50>

# 4. Add indexes
# - For a scalar field
index_params.add_index(
    field_name="my_id",
    index_type="STL_SORT"
)

# - For a vector field
index_params.add_index(
    field_name="my_vector", 
    index_type="IVF_FLAT",
    metric_type="L2",
    params={"nlist": 1024}
)

# >>> list(index_params)
# [{'field_name': 'my_id', 'index_type': 'STL_SORT', 'index_name': ''}, {'field_name': 'my_vector', 'index_type': 'IVF_FLAT', 'index_name': '', 'metric_type': 'L2', 'params': {'nlist': 1024}}]

# 5. Create a collection (schema only; the indexes are created separately below)
client.create_collection(
    collection_name="customized_setup",
    schema=schema
)

# 6. Create indexes
client.create_index(
    collection_name="customized_setup",
    index_params=index_params,
    sync=False
)

# List the index names of the collection
client.list_indexes(collection_name="customized_setup")
# -> ['my_id', 'my_vector']

# Describe an index
client.describe_index(
    collection_name="customized_setup",
    index_name="my_id"    
)
# -> {'index_type': 'STL_SORT', 'field_name': 'my_id', 'index_name': 'my_id', 'total_rows': 0, 'indexed_rows': 0, 'pending_index_rows': 0, 'state': 'Finished'}

# Drop an index
client.drop_index(
    collection_name="customized_setup", 
    index_name="my_id"
)

load & status


# 7. Load the collection, requesting two replicas
client.load_collection(
    collection_name="customized_setup",
    replica_number=2
)

# Refresh the load of a collection
# NOTE(review): presumably meant for a collection that is already loaded — confirm.
client.refresh_load(
    collection_name="test_collection"
) 

# Release the collection
client.release_collection(
    collection_name="customized_setup"
)

查看状态

collection or partition

client.get_load_state(collection_name="customized_setup") 
# -> {'state': <LoadState: NotLoad>}

partition

# Create a partition
client.create_partition(
    collection_name="test_collection", 
    partition_name="partition_A"
)

# Load the partition into memory

client.load_partitions(
    collection_name="test_collection",
    partition_names=["partition_A"]
)

# Check the load state
client.get_load_state(collection_name="test_collection") 

# Release the partition from memory
client.release_partitions(
    collection_name="test_collection",
    partition_names=["partition_A"]
)

# Drop the partition
client.drop_partition(
    collection_name="test_collection", 
    partition_name="partition_A"
)

# Get partition statistics (here for the "_default" partition)
client.get_partition_stats(
    collection_name="test_collection",
    partition_name="_default"
)

# Check whether a partition exists
client.has_partition(
    collection_name="test_collection", 
    partition_name="partition_A"
) 

# List all partitions
client.list_partitions(
    collection_name="test_collection", 
)

Vector

# Insert entities: each dict is one row holding the primary key "id",
# a 5-dimensional "vector", and an extra "color" value.
client.insert(
    collection_name="test_collection",
    data=[
        {"id": 0, "vector": [0.5, 0.09, 0.2, 0.15, 0.05], "color": "green"},
        {"id": 1, "vector": [0.04, 0.09, 0.33, 0.03, 0.35], "color": "blue"},
        {"id": 2, "vector": [0.1, 0.21, 0.41, 0.36, 0.9], "color": "orange"},
        # ... entities 3-7 omitted for brevity ...
        {"id": 8, "vector": [0.12, 0.16, 0.25, 0.2, 0.16], "color": "pink"},
        {"id": 9, "vector": [0.07, 0.38, 0.36, 0.03, 0.47], "color": "brown"}
    ]
)

# Upsert two entities (ids 1 and 2) with new 5-dimensional vectors.
res = client.upsert(
    collection_name="test_collection",
    data=[
        {
            'id': 1,
             'vector': [
                 0.3457690490452393,
                 -0.9401784221711342,
                 0.9123948134344333,
                 0.49519396415367245,
                 -0.558567588166478
             ]
       },
       {
           'id': 2,
           'vector': [
               0.42349086179692356,
               -0.533609076732849,
               -0.8344432775467099,
               0.675761846081416,
               0.57094256393761057
           ]
       }
   ]
)

# Get entities by primary key
res = client.get(
    collection_name="test_collection",
    # ids=1,
    ids=[2, 5, 8]
)

# Delete entities by primary key
client.delete(
    collection_name="test_collection",
    ids=[3, 6, 7]
)

# Delete entities matching a filter expression
client.delete(
    collection_name="test_collection",
    filter="id in [1, 8, 9] and color like 'b%'"
)

# Query with a scalar filter, returning at most 5 results
res = client.query(
    collection_name="test_collection",
    # filter="",
    filter="id in [6,7,8]",
    limit=5,
) 

# search - by similarity
# Basic search parameters: inner-product (IP) metric, no extra options.
search_params = {
    "metric_type": "IP",
    "params": {}
}

# This reassigns search_params, so only this version reaches the search call
# below: same metric, plus "radius" / "range_filter" values.
# NOTE(review): presumably these bound the accepted similarity scores
# (range search) — confirm against the Milvus documentation.
search_params = {
    "metric_type": "IP",
    "params": {
        "radius": 0.1,
        "range_filter": 0.8
    }
}

# Search with one 5-dimensional query vector, keep the top 3 hits, and only
# consider entities whose "color" starts with "red".
res = client.search(
    collection_name="test_collection",
    data=[[0.05, 0.23, 0.07, 0.45, 0.13]],
    limit=3,
    filter='color like "red%"',
    search_params=search_params
)

search

# This example uses the ORM-style API rather than MilvusClient; the names below
# must be imported before the snippet can run.
from pymilvus import AnnSearchRequest, Collection, WeightedRanker

# NOTE(review): the ORM `Collection` presumably relies on a connection opened
# beforehand (e.g. via `pymilvus.connections.connect(...)`) — confirm for your setup.
collection = Collection(name='{your_collection_name}') # Replace with the actual name of your collection

# Run one ANN request per vector field, then merge the two result lists with a reranker.
res = collection.hybrid_search(
    reqs=[
        AnnSearchRequest(
            data=[['{your_text_query_vector}']],  # Replace with your text vector data
            anns_field='{text_vector_field_name}',  # Textual data vector field
            param={"metric_type": "IP", "params": {"nprobe": 10}}, # Search parameters
            limit=2
        ),
        AnnSearchRequest(
            data=[['{your_image_query_vector}']],  # Replace with your image vector data
            anns_field='{image_vector_field_name}',  # Image data vector field
            param={"metric_type": "IP", "params": {"nprobe": 10}}, # Search parameters
            limit=2
        )
    ],
    # Use WeightedRanker to combine results with specified weights
    rerank=WeightedRanker(0.8, 0.2), # Assign weights of 0.8 to text search and 0.2 to image search
    # Alternatively, use RRFRanker for reciprocal rank fusion reranking
    # (also importable from pymilvus)
    # rerank=RRFRanker(),
    limit=2
)


# Single-vector search through the ORM `collection` object, restricted to
# entities with id > 3 and returning the "id" and "vector" fields.
# NOTE(review): `param`, `BATCH_SIZE` and `LIMIT` are not defined in any snippet
# visible here — define them before running this call.
# NOTE(review): `batch_size` looks like an iterator-style option — confirm that
# Collection.search accepts it.
res = collection.search(
    data=[[0.1,0.2,-0.3,-0.4,0.5]],
    anns_field="vector",
    param=param,
    batch_size=BATCH_SIZE,
    limit=LIMIT,
    expr="id > 3",
    output_fields=["id", "vector"]
)

# Walk the search result. The bare attribute expressions below only show which
# accessors exist; their values are not used.
# NOTE(review): presumably `res` holds one `hits` entry per query vector — confirm.
for hits in res:
    
    # Per-query accessors
    hits.ids 
    hits.distances
    
    for hit in hits:
        
        # Per-hit accessors; "vector" is one of the requested output fields and
        # is read both as an attribute and through get().
        hit.id 
        hit.distance  
        hit.vector 
        hit.get("vector")