57 lines
2.1 KiB
Python
57 lines
2.1 KiB
Python
|
from docx import Document
|
|||
|
from pymilvus import MilvusClient
|
|||
|
import requests
|
|||
|
import json,time,os
|
|||
|
|
|||
|
# Ingestion script: read every Word document in `directory_path`, glue its
# paragraphs into text chunks, fetch an embedding for each chunk from the
# embedding HTTP service, and insert the resulting rows into Milvus.

# Local directory that holds the documents to ingest.
directory_path = '/Users/zhengfei/Desktop/大模型/书籍/第二批'

client = MilvusClient(
    uri='http://114.55.128.195:19530'
)

EMBEDDING_URL = "http://114.55.128.195:8001/get_embedding/"
COLLECTION_NAME = "ydkf"
# NOTE(review): every row of every document is tagged with this single,
# hard-coded path. That looks like a copy-paste leftover (one would expect
# a value derived from the file being processed) -- confirm the intended
# value before changing it; it is kept verbatim here.
SOURCE_LABEL = '/projects/ai_chat/knowledge_base/ydkf/content/骨盆和骶骼关节功能解剖 手法操作指南 详解局部解剖和功能 涵盖评估分析 运动 肌肉能量技术及替代_14533413.docx'
# Number of paragraphs ignored at the start and at the end of each document.
SKIPPED_PARAGRAPHS = 200
# A chunk is emitted once the accumulated text is longer than this.
CHUNK_MIN_CHARS = 500
# Buffered rows are inserted once there are more than this many of them.
BATCH_LIMIT = 20
# Seconds to wait for the embedding service; without a timeout a stalled
# request would block the whole run indefinitely.
REQUEST_TIMEOUT = 60


def _embed(text):
    """Return the embedding vector for *text*, or None on a non-200 reply.

    Raises whatever `requests` raises on transport/timeout errors, and
    ValueError/KeyError/IndexError if the reply is not the expected JSON
    (an object with "code" and a non-empty "data" list).
    """
    response = requests.post(
        EMBEDDING_URL,
        json={"text": [text]},
        headers={"Content-Type": "application/json"},
        timeout=REQUEST_TIMEOUT,
    )
    res_json = response.json()
    if res_json["code"] != 200:
        print(f"embedding service returned code {res_json['code']}")
        return None
    return res_json["data"][0]


def _make_row(vector, text):
    """Build one Milvus row for a text chunk and its embedding vector."""
    return {'vector': vector, 'text': text, 'source': SOURCE_LABEL}


def _flush(rows):
    """Insert *rows* into the collection; an empty batch is a no-op."""
    if rows:
        client.insert(collection_name=COLLECTION_NAME, data=rows)


def _ingest(file_path):
    """Chunk, embed and store the body paragraphs of one document.

    Failures on individual paragraphs are printed and skipped (best effort);
    failures while opening the document propagate to the caller.
    """
    # `Document.paragraphs` builds a new list on every access, so read it
    # once instead of once per loop iteration.
    paragraphs = Document(file_path).paragraphs
    # Documents with no more than 2 * SKIPPED_PARAGRAPHS paragraphs yield an
    # empty body and are therefore skipped entirely.
    body = paragraphs[SKIPPED_PARAGRAPHS:len(paragraphs) - SKIPPED_PARAGRAPHS]

    text = ''
    rows = []
    for paragraph in body:
        try:
            # Add the paragraph to the chunk currently being accumulated.
            text += paragraph.text
            # Once the chunk is long enough, embed it and start a new one.
            # When embedding fails the text is kept, so the next paragraph
            # retries with a slightly longer chunk.
            if len(text) > CHUNK_MIN_CHARS:
                vector = _embed(text)
                if vector is not None:
                    rows.append(_make_row(vector, text))
                    text = ''
            # The buffer is cleared only after a successful insert, so a
            # failed insert is retried on the next paragraph.
            if len(rows) > BATCH_LIMIT:
                _flush(rows)
                rows = []
        except Exception as e:
            print(e)

    # Embed whatever text is left over so the end of the range is not lost.
    # Guarded separately so a failure here cannot prevent the final flush.
    if text.strip():
        try:
            vector = _embed(text)
            if vector is not None:
                rows.append(_make_row(vector, text))
        except Exception as e:
            print(e)

    # Final flush happens outside the per-paragraph handler, so buffered rows
    # are stored even if the last paragraph failed.
    _flush(rows)


# Walk the directory and ingest each entry; a failure on one file (for
# example an entry that is not a readable .docx) is printed and the run
# continues with the next file.
for filename in os.listdir(directory_path):
    try:
        # Build the full path of the entry.
        file_path = os.path.join(directory_path, filename)
        print(file_path)
        _ingest(file_path)
    except Exception as e:
        print(e)
|
|||
|
|