昆仑django向量数据库原理详解

apps/ai/urls

path('/knowledge', views.Knowledge.as_view()),

apps/ai/models

class Knowledge(BaseModel):
    """A single knowledge-base article persisted in MySQL.

    Vector chunks derived from ``content`` are stored separately in the
    'knowledge' vector collection and point back to this row via ``mid``
    (see the Knowledge APIView) — TODO confirm against the vector schema.
    """

    # Article title; also the text fed to entity extraction in the views.
    title = models.CharField(max_length=255)
    # Full plain-text body; chunked and embedded for vector search.
    content = models.TextField(blank=True, null=True)
    # Unix timestamp (seconds) set at creation time in the views.
    published_at = models.IntegerField(null=True)
    # Numeric content type supplied by the client ("type" form field).
    type = models.IntegerField(default=0)
    # Extracted entity keys, stored as {"list": ["key_<type>_<id>", ...]}.
    entity = models.JSONField(blank=True, null=True)

    class Meta:
        db_table = 'knowledge'

apps/ai/views

class Knowledge(APIView):
    """
    get: knowledge item detail
    post: create knowledge items (file / url / form sources)
    put: edit a knowledge item and rebuild its vectors
    delete: delete a knowledge item and its vectors
    """

    def get(self, request):
        """Return one knowledge record, looked up by the ``id`` query param.

        Responds with ``miss_param`` when ``id`` is absent and ``no_data``
        when no row matches.
        """
        knowledge_id = request.query_params.get('id')
        if not knowledge_id:
            return response_util.miss_param(msg='id: 丢失')

        # One query via first() instead of exists() + five first() calls,
        # each of which hit the database separately in the original.
        record = models.Knowledge.objects.filter(id=knowledge_id).first()
        if record is None:
            return response_util.no_data()

        data = dict(
            id=knowledge_id,
            meta=dict(
                mid=knowledge_id,
                type=record.type,
                title=record.title,
                content=record.content,
                published_at=record.published_at,
            ),
        )
        return response_util.ok(data=data)

    def post(self, request):
        """Create knowledge records from files, a URL, or a plain form.

        Each source is parsed into a (title, content) pair; when ``status``
        is truthy the content is also chunked, embedded and inserted into
        the 'knowledge' vector collection, keyed back to the MySQL row id.
        """
        form = forms.KnowledgePost(request.data)
        if not form.is_valid():
            return response_util.param_error(msg=functions.errors_to_str(form.errors))

        source_type = request.data.get('source_type', 'file')
        content_type = request.data.get('type')
        status = request.data.get('status', True)

        results = []
        if source_type == 'file':
            # Parse every uploaded file into plain text.
            for uploaded_file in request.FILES.getlist('source'):
                filename, ext = os.path.splitext(uploaded_file.name)
                try:
                    text = self.parse_file(uploaded_file, ext)
                except ValueError as e:
                    return response_util.error(msg=str(e))
                results.append({'title': filename,
                                'type': ext.strip("."), 'content_type': content_type, 'content': text})
        elif source_type == 'url':
            # Fetch the page and strip the HTML down to its visible text.
            url = request.data.get('url')
            if not url:
                return response_util.error(msg='Missing URL parameter')
            try:
                title = request.data.get('title')
                response = requests.get(url)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')
                text = soup.get_text()
                results = [{'title': title, 'type': 'url',
                            'content_type': content_type, 'content': text}]
            except requests.exceptions.RequestException:
                return response_util.error(msg='Failed to fetch website content')
        elif source_type == 'form':
            # Form submissions carry the text directly.
            title = request.data.get('title')
            content = request.data.get('content')
            results = [{'title': title, 'type': 'form',
                        'content_type': content_type, 'content': content}]
        else:
            return response_util.error(msg='Invalid source type')

        mid_list = []
        if status:
            # Hoisted loop invariants: splitter config, vector client and
            # the langchain import were rebuilt on every iteration before.
            from langchain.text_splitter import RecursiveCharacterTextSplitter
            chunk_size = int(request.data.get('chunk_size') or 300)
            # BUGFIX: read from request.data (was request.POST, which is
            # empty for JSON payloads) so chunk_overlap is honoured for
            # every content type, consistent with chunk_size above.
            chunk_overlap = int(request.data.get('chunk_overlap') or 20)
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size, chunk_overlap=chunk_overlap)
            client = zilliz()
            collection_name = 'knowledge'

            for item in results:
                title = item.get('title')
                doc_type = item.get('content_type')  # renamed: was shadowing builtin `type`
                content = item.get('content')
                published_at = int(time.time())

                # Entity extraction from the title.
                entity_list = dict(list=[
                    f"key_{e['meta'][0]['type']}_{e['meta'][0]['id']}"
                    for e in functions.get_entity(title)])

                # Insert the MySQL row first so the vector chunks can
                # reference its primary key as `mid`.
                obj = models.Knowledge(
                    type=doc_type,
                    title=title,
                    content=content,
                    published_at=published_at,
                    entity=entity_list,
                )
                obj.save()
                mid = obj.id
                mid_list.append(mid)

                # Chunk, embed, and collect the vector payloads.
                data = []
                for doc in text_splitter.split_text(content):
                    vector = functions.get_embedding(' '.join([title, doc]))
                    if vector:
                        data.append({
                            "id": str(uuid.uuid4()),
                            "content_vector": vector,
                            "type": int(doc_type),
                            "published_at": published_at,
                            "mid": int(mid) if mid else 0,
                            "meta": {
                                "title": title,
                                "content": doc,
                            },
                            "entity": entity_list,
                        })
                # Insert this document's vectors into the collection.
                client.insert_single_entity(collection_name, data)

        return response_util.ok(data=dict(id=mid_list, results=results))

    def put(self, request):
        """Edit a knowledge record and rebuild its vector chunks.

        Deletes the record's existing vectors (matched by ``mid``) and
        re-inserts freshly embedded chunks of the updated content.
        """
        form = forms.KnowledgePut(request.data)
        if not form.is_valid():
            return response_util.param_error(msg=functions.errors_to_str(form.errors))

        knowledge_id = request.data.get("id")
        doc_type = request.data.get("type")
        client = zilliz()
        collection_name = 'knowledge'
        res = client.query(f'mid == {int(knowledge_id)}',
                           collection_name, 100, 0, ['id'])
        qs = models.Knowledge.objects.filter(id=knowledge_id)
        record = qs.first()
        if record is None:
            return response_util.no_data(msg='id: 不存在')

        # BUGFIX: capture the pre-update title. The original compared
        # `title != obj.first().title` AFTER obj.update(title=title, ...),
        # which is always False, so entities were never re-extracted.
        old_title = record.title
        title = request.data.get('title', record.title)
        content = request.data.get('content', record.content)
        qs.update(title=title, content=content, type=doc_type)

        if title != old_title:
            # Title changed: re-run entity extraction and persist it.
            entity_list = dict(list=[
                f"key_{e['meta'][0]['type']}_{e['meta'][0]['id']}"
                for e in functions.get_entity(title)])
            qs.update(entity=entity_list)
        else:
            entity_list = record.entity

        # Drop the stale vectors, then re-chunk and re-embed the content.
        client.delete(collection_name, [r['id'] for r in res])
        from langchain.text_splitter import RecursiveCharacterTextSplitter
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=200, chunk_overlap=20)
        data = []
        for doc in text_splitter.split_text(content):
            vector = functions.get_embedding(' '.join([title, doc]))
            if vector:
                data.append({
                    "id": str(uuid.uuid4()),
                    "content_vector": vector,
                    "type": int(doc_type),
                    "published_at": int(record.published_at),
                    "mid": int(knowledge_id),
                    "meta": {
                        "title": title,
                        "content": doc,
                    },
                    # BUGFIX: was {"lsit": entity_list} — a typo plus an
                    # extra nesting level; post() stores the entity dict
                    # directly, so put() now matches that shape.
                    "entity": entity_list,
                })
        client.insert_single_entity(collection_name, data)
        return response_util.ok()

    def delete(self, request):
        """Delete a knowledge record and its vectors (matched by ``mid``)."""
        form = forms.KnowledgeDel(request.data)
        if not form.is_valid():
            return response_util.param_error(msg=functions.errors_to_str(form.errors))

        knowledge_id = int(request.data.get('id'))
        qs = models.Knowledge.objects.filter(id=knowledge_id)
        # Check existence before touching the vector store (the original
        # queried the vector store even for non-existent ids).
        if not qs.exists():
            return response_util.no_data(msg='id: 不存在')

        client = zilliz()
        collection_name = 'knowledge'
        res = client.query(f'mid == {knowledge_id}', collection_name, 100, 0, ['id'])
        client.delete(collection_name, [r['id'] for r in res])
        qs.delete()
        return response_util.ok()

    def parse_file(self, uploaded_file, ext):
        """Extract plain text from an uploaded file.

        Supported extensions: .docx, .pdf, .txt, .md.
        Raises ValueError for any other extension.
        """
        from docx import Document
        from io import BytesIO

        file_content = uploaded_file.read()
        if ext == '.docx':
            doc = Document(BytesIO(file_content))
            text = '\n'.join([para.text for para in doc.paragraphs])
        elif ext == '.pdf':
            text = self.pdftotxt(BytesIO(file_content))
        elif ext == '.txt':
            text = file_content.decode()
        elif ext == '.md':
            # Rendered markdown (HTML) is returned, matching the original.
            text = markdown.markdown(file_content.decode())
        else:
            raise ValueError('Unsupported file type')
        return text

    def pdftotxt(self, fp):
        """Extract text from a PDF file-like object via pdfminer."""
        from pdfminer.high_level import extract_text
        return extract_text(fp)

get: 知识库详情
post: 创建知识库
put: 编辑知识库
delete: 删除知识库
接口路径
https://tkapi.coregem.net/ai/knowledge

get: 知识库详情

get接口需要根据向量数据库的id获取文章具体信息

在这里插入图片描述

在这里插入图片描述

post: 创建知识库

post接口提供三种上传方式,分别为多文件上传,url,和表单上传

文件上传

文件类型支持docx,md,pdf,txt四种类型
在这里插入图片描述

url上传

在这里插入图片描述

表单上传

在这里插入图片描述

put: 编辑知识库

在这里插入图片描述
当修改之后再进行查询会发现文件已经被修改
在这里插入图片描述

delete: 删除知识库

在这里插入图片描述
在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值