apps/ai/urls
# BUG FIX: Django URL patterns must not start with '/' — every incoming URL is
# matched without its leading slash, so '/knowledge' could never match and the
# documented endpoint /ai/knowledge would 404. See Django URL dispatcher docs.
path('knowledge', views.Knowledge.as_view()),
apps/ai/models
class Knowledge(BaseModel):
    """A knowledge-base article stored in MySQL.

    Each row is mirrored into an external vector collection named
    'knowledge', whose entries point back here via ``mid == id``
    (see the Knowledge view's post/put handlers).
    """
    # NOTE(review): BaseModel is declared elsewhere in the project —
    # presumably adds common fields (created/updated); confirm.
    title = models.CharField(max_length=255)
    content = models.TextField(blank=True, null=True)
    # Unix timestamp in seconds — the view sets it from int(time.time()).
    published_at = models.IntegerField(null=True)
    # Content-type discriminator supplied by the client as 'type';
    # NOTE(review): the meaning of each value isn't visible here — confirm.
    type = models.IntegerField(default=0)
    # Entity-extraction result shaped like
    # {"list": ["key_<type>_<id>", ...]} (built by the view from
    # functions.get_entity(title)).
    entity = models.JSONField(blank=True, null=True)

    class Meta:
        db_table = 'knowledge'
apps/ai/views
class Knowledge(APIView):
    """
    get: 知识库详情
    post: 创建知识库
    put: 编辑知识库
    delete: 删除知识库

    CRUD for knowledge-base records. Rows live in MySQL
    (models.Knowledge); chunk embeddings live in the zilliz collection
    'knowledge', linked back to the row via ``mid``.
    """

    # Single place for the vector-collection name used by post/put/delete.
    COLLECTION_NAME = 'knowledge'

    # @method_decorator(cache_page(1 * 10))
    def get(self, request):
        """Return one record's detail, looked up by the ?id= query param."""
        kid = request.query_params.get('id')
        if not kid:
            return response_util.miss_param(msg='id: 丢失')
        # Fetch the row once instead of one query per field
        # (the original called obj.first() five times).
        record = models.Knowledge.objects.filter(id=kid).first()
        if record is None:
            return response_util.no_data()
        data = dict(
            id=kid,
            meta=dict(
                mid=kid,
                type=record.type,
                title=record.title,
                content=record.content,
                published_at=record.published_at,
            ),
        )
        return response_util.ok(data=data)

    def post(self, request):
        """Create records from files, a URL, or a plain form.

        Each source is normalized into
        {'title', 'type', 'content_type', 'content'}; when status is
        truthy the record is saved to MySQL and its chunks embedded
        into the vector store.
        """
        form = forms.KnowledgePost(request.data)
        if not form.is_valid():
            return response_util.param_error(msg=functions.errors_to_str(form.errors))
        source_type = request.data.get('source_type', 'file')
        content_type = request.data.get('type')
        status = request.data.get('status', True)

        # 如果是文件,我就进行解析 (file upload: parse each file to text)
        if source_type == 'file':
            results = []
            for uploaded_file in request.FILES.getlist('source'):
                filename, ext = os.path.splitext(uploaded_file.name)
                try:
                    text = self.parse_file(uploaded_file, ext)
                except ValueError as e:
                    return response_util.error(msg=str(e))
                results.append({'title': filename, 'type': ext.strip("."),
                                'content_type': content_type, 'content': text})
        # 如果是url,我就输入title,并进行解析 (URL: fetch and strip HTML)
        elif source_type == 'url':
            url = request.data.get('url')
            if not url:
                return response_util.error(msg='Missing URL parameter')
            try:
                title = request.data.get('title')
                response = requests.get(url)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')
                results = [{'title': title, 'type': 'url',
                            'content_type': content_type,
                            'content': soup.get_text()}]
            except requests.exceptions.RequestException:
                return response_util.error(msg='Failed to fetch website content')
        # 如果是表单提交我就直接解析 (form: take title/content verbatim)
        elif source_type == 'form':
            results = [{'title': request.data.get('title'), 'type': 'form',
                        'content_type': content_type,
                        'content': request.data.get('content')}]
        else:
            return response_util.error(msg='Invalid source type')

        mid_list = []
        # 如果状态是True那么我就存入向量数据库
        if status:
            # BUG FIX: the original read chunk_size from request.data but
            # chunk_overlap from request.POST — read both from request.data
            # so JSON bodies work for both. Hoisted out of the loop: they
            # are loop-invariant.
            chunk_size = int(request.data.get('chunk_size') or 300)
            chunk_overlap = int(request.data.get('chunk_overlap') or 20)
            for item in results:
                title = item.get('title')
                item_type = item.get('content_type')
                content = item.get('content')
                published_at = int(time.time())
                # 实体提取 (entity extraction on the title)
                entity_list = self._extract_entities(title)
                # 插入mysql
                obj = models.Knowledge(
                    type=item_type,
                    title=title,
                    content=content,
                    published_at=published_at,
                    entity=entity_list,
                )
                obj.save()
                mid_list.append(obj.id)
                # mysql插入完毕,进行向量插入 (then insert the chunk vectors)
                rows = self._build_vector_rows(
                    title=title, content=content, content_type=item_type,
                    published_at=published_at, mid=obj.id,
                    entity_list=entity_list,
                    chunk_size=chunk_size, chunk_overlap=chunk_overlap)
                client = zilliz()
                client.insert_single_entity(self.COLLECTION_NAME, rows)
        return response_util.ok(data=dict(id=mid_list, results=results))

    def put(self, request):
        """Edit a record and rebuild its vectors (delete old, insert new)."""
        form = forms.KnowledgePut(request.data)
        if not form.is_valid():
            return response_util.param_error(msg=functions.errors_to_str(form.errors))
        kid = request.data.get("id")
        content_type = request.data.get("type")
        qs = models.Knowledge.objects.filter(id=kid)
        record = qs.first()
        if record is None:
            return response_util.no_data(msg='id: 不存在')
        client = zilliz()
        old_rows = client.query(f'mid == {int(kid)}',
                                self.COLLECTION_NAME, 100, 0, ['id'])
        # BUG FIX: the original compared title != obj.first().title AFTER
        # obj.update(title=...), so the comparison was always False and
        # entities were never re-extracted. Capture the old title first.
        old_title = record.title
        title = request.data.get('title', record.title)
        content = request.data.get('content', record.content)
        qs.update(title=title, content=content, type=content_type)
        if title != old_title:
            # 实体提取 — title changed, so re-run entity extraction.
            entity_list = self._extract_entities(title)
            qs.update(entity=entity_list)
        else:
            entity_list = record.entity
        # 更新向量库 — drop the stale vectors, then insert the new chunks.
        client.delete(self.COLLECTION_NAME, [r['id'] for r in old_rows])
        # BUG FIX: the original wrote "entity": {"lsit": entity_list} —
        # a typo key AND a double wrap, inconsistent with post(); the
        # shared helper now stores entity_list directly, as post() does.
        # Guard published_at: the column is nullable (int(None) raises).
        rows = self._build_vector_rows(
            title=title, content=content, content_type=content_type,
            published_at=int(record.published_at or 0), mid=int(kid),
            entity_list=entity_list, chunk_size=200, chunk_overlap=20)
        client.insert_single_entity(self.COLLECTION_NAME, rows)
        return response_util.ok()

    def delete(self, request):
        """Delete a record and its vectors, looked up by body param 'id'."""
        form = forms.KnowledgeDel(request.data)
        if not form.is_valid():
            return response_util.param_error(msg=functions.errors_to_str(form.errors))
        kid = int(request.data.get('id'))
        qs = models.Knowledge.objects.filter(id=kid)
        # Check existence before touching the vector store — the original
        # queried zilliz even for ids that don't exist.
        if not qs.exists():
            return response_util.no_data(msg='id: 不存在')
        client = zilliz()
        res = client.query(f'mid == {kid}', self.COLLECTION_NAME, 100, 0, ['id'])
        client.delete(self.COLLECTION_NAME, [r['id'] for r in res])
        qs.delete()
        return response_util.ok()

    @staticmethod
    def _extract_entities(title):
        """Run entity extraction on *title*; encode hits as key_<type>_<id>.

        Returns {"list": [...]}, the shape stored in Knowledge.entity.
        """
        return dict(list=[
            f"key_{e['meta'][0]['type']}_{e['meta'][0]['id']}"
            for e in functions.get_entity(title)])

    @staticmethod
    def _build_vector_rows(title, content, content_type, published_at,
                           mid, entity_list, chunk_size, chunk_overlap):
        """Split *content* into chunks, embed each, and return zilliz rows.

        Chunks whose embedding comes back falsy are skipped, matching the
        original behavior. Shared by post() and put() so the row schema
        cannot drift between them again.
        """
        # Text Splitter (local import kept: langchain is heavy to load)
        from langchain.text_splitter import RecursiveCharacterTextSplitter
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        rows = []
        for doc in text_splitter.split_text(content):
            vector = functions.get_embedding(' '.join([title, doc]))
            if vector:
                rows.append({
                    "id": str(uuid.uuid4()),  # one uuid per chunk (the original put() generated two)
                    "content_vector": vector,
                    "type": int(content_type),
                    "published_at": published_at,
                    "mid": int(mid) if mid else 0,
                    "meta": {
                        "title": title,
                        "content": doc,
                    },
                    "entity": entity_list,
                })
        return rows

    def parse_file(self, uploaded_file, ext):
        """Extract plain text from an uploaded file by extension.

        Supports .docx, .pdf, .txt and .md (note: .doc is NOT supported,
        despite what the external docs say). Raises ValueError for
        anything else.
        """
        from docx import Document
        from io import BytesIO
        file_content = uploaded_file.read()
        if ext == '.docx':
            doc = Document(BytesIO(file_content))
            return '\n'.join(para.text for para in doc.paragraphs)
        if ext == '.pdf':
            return self.pdftotxt(BytesIO(file_content))
        if ext == '.txt':
            return file_content.decode()
        if ext == '.md':
            return markdown.markdown(file_content.decode())
        raise ValueError('Unsupported file type')

    def pdftotxt(self, fp):
        """Extract all text from a PDF file-like object via pdfminer."""
        from pdfminer.high_level import extract_text
        return extract_text(fp)
get: 知识库详情
post: 创建知识库
put: 编辑知识库
delete: 删除知识库
接口路径
https://tkapi.coregem.net/ai/knowledge
get: 知识库详情
get接口需要根据向量数据库的id获取文章具体信息
post: 创建知识库
post接口提供三种上传方式,分别为多文件上传,url,和表单上传
文件上传
文件类型支持docx,md,pdf,txt四种类型(不支持旧版 .doc)
url上传
表单上传
put: 编辑知识库
当修改之后再进行查询会发现文件已经被修改
delete: 删除知识库