57 lines
2.1 KiB
Python
57 lines
2.1 KiB
Python
from docx import Document
|
||
from pymilvus import MilvusClient
|
||
import requests
|
||
import json,time,os
|
||
|
||
# Local directory whose entries are loaded one by one as Word documents.
directory_path = '/Users/zhengfei/Desktop/大模型/书籍/第二批'

# Paragraphs skipped at the start and at the end of every document.
SKIP_PARAGRAPHS = 200
# A chunk is sent for embedding once its accumulated text exceeds this length.
CHUNK_CHARS = 500
# Pending rows are inserted once this many have accumulated (the original
# flushed when its counter exceeded 20, i.e. on the 21st chunk).
BATCH_ROWS = 21
# Seconds to wait for the embedding service before giving up on a request.
# NOTE(review): 60 is a guess at a generous upper bound -- tune as needed.
EMBED_TIMEOUT = 60

client = MilvusClient(
    uri='http://114.55.128.195:19530'
)


def _embed_row(chunk):
    """Request an embedding for *chunk* and build the row to insert.

    Returns a dict with ``vector``, ``text`` and ``source`` keys, or ``None``
    when the service answers with a ``code`` other than 200.  Network and
    JSON-decoding errors propagate to the caller.
    """
    response = requests.post(
        "http://114.55.128.195:8001/get_embedding/",
        json={"text": [chunk]},
        headers={"Content-Type": "application/json"},
        timeout=EMBED_TIMEOUT,  # without a timeout a stalled socket blocks forever
    )
    res_json = response.json()
    if res_json["code"] != 200:
        print(f"embedding service returned code {res_json['code']}")
        return None
    return {
        'vector': res_json["data"][0],
        'text': chunk,
        # NOTE(review): the same source path is recorded for every file in
        # directory_path; this looks like a copy-paste leftover -- confirm
        # whether it should be derived from the file being processed.
        'source': '/projects/ai_chat/knowledge_base/ydkf/content/骨盆和骶骼关节功能解剖 手法操作指南 详解局部解剖和功能 涵盖评估分析 运动 肌肉能量技术及替代_14533413.docx',
    }


def _flush(batch):
    """Insert the pending rows into the ``ydkf`` collection and empty *batch*.

    Does nothing for an empty batch.  The batch is cleared only after the
    insert succeeds, so a failed insert is retried on the next flush.
    """
    if not batch:
        return
    client.insert(
        collection_name="ydkf",
        data=batch
    )
    batch.clear()


# Walk the directory.
for filename in os.listdir(directory_path):
    try:
        # Build the full file path.
        file_path = os.path.join(directory_path, filename)

        # Load the Word document.
        print(file_path)
        doc = Document(file_path)

        # Read the paragraph list once; python-docx rebuilds it on every
        # access to ``doc.paragraphs``.
        paragraphs = doc.paragraphs
        body = paragraphs[SKIP_PARAGRAPHS:len(paragraphs) - SKIP_PARAGRAPHS]

        text = ''
        batch = []
        for para in body:
            try:
                # Append the paragraph text to the current chunk.
                text += para.text
                # Once the chunk is long enough, embed it and start a new one.
                # If the service rejects the chunk, the text is kept and the
                # request is retried (with more text) on the next paragraph.
                if len(text) > CHUNK_CHARS:
                    row = _embed_row(text)
                    if row is not None:
                        batch.append(row)
                        text = ''

                if len(batch) >= BATCH_ROWS:
                    _flush(batch)
            except Exception as e:
                # Best effort: report the failure and carry on with the
                # next paragraph.
                print(e)

        # Embed whatever text is left over so that the end of the document
        # is not silently dropped.
        try:
            if text.strip():
                row = _embed_row(text)
                if row is not None:
                    batch.append(row)
        except Exception as e:
            print(e)

        # Final flush happens after the loop so it does not depend on the
        # last iteration completing without an error.
        _flush(batch)
    except Exception as e:
        # Entries that cannot be opened as .docx files are reported and skipped.
        print(e)
|
||
|