06-FAISS向量数据库

原创于 2025-12-16 15:09:16 发布 · 310 阅读

1 ·

CC 4.0 BY-SA版权

文章标签：

#faiss #python #开发语言

Embedding和RAG 专栏收录该内容

7 篇文章

订阅专栏

①.本节基于 FAISS（Facebook 开源的高效向量检索库）和 LangChain 构建一个中文文本向量检索系统：先将文本数据向量化并存入 FAISS 向量库，再在其上实现相似性检索。

1.安装FAISS

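pip install faiss-cpu（如果需要 GPU 加速，也可以改为安装 faiss-gpu，视具体使用情况而定）。下面的代码还用到了 langchain-community、langchain-huggingface 和 sentence-transformers，需要一并安装。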

2.实现代码

import faiss
from langchain_community.docstore import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document


# Embedding model used to vectorise both the stored documents and the queries.
model_name = "BAAI/bge-large-zh-v1.5"
model_kwargs = dict(device='cpu')
# Normalise the output vectors: set True to compute cosine similarity.
encode_kwargs = dict(normalize_embeddings=True)
bge_hf_embedding = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

# 1. Initialise the vector database.
# The FAISS index has to be created with the vector dimension up front.
# Probe the embedding model for its output size instead of hard-coding 1024,
# so that switching to a model with a different dimension keeps the index and
# the embeddings in agreement.
embedding_dim = len(bge_hf_embedding.embed_query("hello world"))

# IndexFlatL2 performs exact search on L2 distance. For unit-length vectors
# the L2 ordering is the same as the cosine-similarity ordering.
index = faiss.IndexFlatL2(embedding_dim)

vector_store = FAISS(
    embedding_function=bge_hf_embedding,
    index=index,
    docstore=InMemoryDocstore(),  # in-memory document storage
    index_to_docstore_id={},  # empty mapping: nothing has been added yet
)

# 2. Prepare the data (Document objects).
# page_content holds the text that is embedded; data that should not be
# embedded belongs in metadata.
# Each entry below is (page_content, metadata), in insertion order.
_document_specs = [
    ("今天早餐我吃了巧克力薄煎饼和炒蛋。", {"source": "tweet", "time": "上午"}),
    ("明天的天气预报是阴天多云，最高气温62华氏度。", {"source": "news"}),
    ("正在用LangChain构建一个激动人心的新项目——快来看看吧！", {"source": "tweet"}),
    ("劫匪闯入城市银行，盗走了100万美元现金。", {"source": "news"}),
    ("哇！那部电影太精彩了，我已经迫不及待想再看一遍。", {"source": "tweet"}),
    ("新iPhone值得这个价格吗？阅读这篇评测一探究竟。", {"source": "website"}),
    ("当今世界排名前十的足球运动员。", {"source": "website"}),
    ("LangGraph是构建有状态智能体应用的最佳框架！", {"source": "tweet"}),
    ("由于对经济衰退的担忧，今日股市下跌500点。", {"source": "news"}),
    ("我有种不好的预感，我要被删除了 :(", {"source": "tweet"}),
]

documents = [
    Document(page_content=content, metadata=meta)
    for content, meta in _document_specs
]

# One explicit id per document, numbered from 1: "id1", "id2", ...
ids = [f"id{number}" for number, _ in enumerate(documents, start=1)]

# Add the documents to the vector store under those ids.
vector_store.add_documents(documents, ids=ids)

# Write the database to disk.
vector_store.save_local('../faiss_db')

# Semantic search: ask for the 2 stored documents most similar to the query.
top_matches = vector_store.similarity_search('今天的投资建议', k=2)
for matched_doc in top_matches:
    print(matched_doc.page_content)
    print(type(matched_doc))

输出：

左边的文件栏也会出现保存的数据库：

②.从保存的向量数据库中进行检索（注意：检索时使用的 Embedding 模型应与建库时使用的模型保持一致，否则查询向量与库中向量不在同一向量空间，检索结果没有参考意义）

from langchain_community.docstore import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_core.embeddings import Embeddings
from sentence_transformers import SentenceTransformer
from langchain_core.documents import Document
import faiss

class CustomQwen3Embeddings(Embeddings):
    """Wrap a SentenceTransformer model as a LangChain ``Embeddings`` object.

    Written for Qwen3 embedding models, but the model is chosen by the caller.

    Args:
        model_name: Model identifier passed straight to ``SentenceTransformer``.
    """

    def __init__(self, model_name):
        self.qwen3_embedding = SentenceTransformer(model_name)

    def embed_query(self, text: str) -> list[float]:
        """Embed a single query string.

        The query is embedded as a one-item batch via ``embed_documents``.

        Args:
            text: The query text.

        Returns:
            The embedding vector for ``text``.
        """
        return self.embed_documents([text])[0]

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed a batch of texts.

        Args:
            texts: The texts to embed.

        Returns:
            One embedding vector per input text, in the same order.
        """
        # encode() returns a NumPy array by default. Convert it to nested
        # Python lists so the result matches the annotated return type.
        return self.qwen3_embedding.encode(texts).tolist()

qwen3 = CustomQwen3Embeddings('Qwen/Qwen3-Embedding-0.6B')

# Load the database from disk.
# allow_dangerous_deserialization=True opts in to deserialising the saved
# store; only do this for files you created yourself or otherwise trust.
# NOTE(review): '../faiss_db' is the path the first script in this article
# saves to, and that index was built with BAAI/bge-large-zh-v1.5. Query
# vectors from a different model are not comparable with the stored vectors,
# so confirm the store was rebuilt with this model (or load it with the model
# that built it) before relying on the results.
vector_store = FAISS.load_local(
    '../faiss_db',
    embeddings=qwen3,
    allow_dangerous_deserialization=True,
)

# Search with scores, keeping only documents whose metadata "source" is "tweet".
resp = vector_store.similarity_search_with_score(
    '有美食的内容吗', k=4, filter={"source": 'tweet'}
)
for doc, score in resp:
    print(type(doc))
    print(doc)
    print(doc.id)
    # ".3f" prints three decimals; the previous "3f" only set a minimum width
    # of 3 and left the default six decimals.
    print(f"{score:.3f}")

输出：