pdf_code/zzb_data_word/parse_docx.py

57 lines
2.1 KiB
Python
Raw Normal View History

2024-12-30 17:51:12 +08:00
from docx import Document
from pymilvus import MilvusClient
import requests
import json,time,os
# Directory holding the .docx books to ingest.
directory_path = '/Users/zhengfei/Desktop/大模型/书籍/第二批'

# Service endpoints and target collection.
MILVUS_URI = 'http://114.55.128.195:19530'
EMBEDDING_URL = "http://114.55.128.195:8001/get_embedding/"
COLLECTION_NAME = "ydkf"

# NOTE(review): every row from every file in `directory_path` is tagged with
# this single hard-coded path. That looks like a copy/paste leftover -- confirm
# whether `source` should instead be derived from the file being processed.
SOURCE_PATH = '/projects/ai_chat/knowledge_base/ydkf/content/骨盆和骶骼关节功能解剖 手法操作指南 详解局部解剖和功能 涵盖评估分析 运动 肌肉能量技术及替代_14533413.docx'

# Number of paragraphs ignored at the start and at the end of each document.
SKIP_PARAGRAPHS = 200
# A chunk is emitted once the accumulated text is strictly longer than this.
MIN_CHUNK_CHARS = 500
# Rows are written to Milvus in batches of this size (the original flushed
# once its counter exceeded 20).
INSERT_BATCH_SIZE = 21
# Upper bound for one embedding request, so a stalled service cannot hang the
# whole run.
EMBEDDING_TIMEOUT_SECONDS = 60


def trim_margins(items, margin=SKIP_PARAGRAPHS):
    """Return *items* without its first and last *margin* elements.

    Returns an empty list when *items* has ``2 * margin`` elements or fewer.
    """
    # max() keeps the stop index from going negative on short inputs, where a
    # negative slice bound would otherwise count from the end.
    return items[margin:max(margin, len(items) - margin)]


def iter_text_chunks(paragraph_texts, min_chars=MIN_CHUNK_CHARS):
    """Yield concatenated paragraph texts, each longer than *min_chars*.

    Paragraph texts are appended (without any separator) until the running
    length exceeds *min_chars*; that text is yielded and accumulation starts
    over. A trailing remainder that never exceeds *min_chars* is not yielded.
    """
    parts = []
    size = 0
    for paragraph_text in paragraph_texts:
        parts.append(paragraph_text)
        size += len(paragraph_text)
        if size > min_chars:
            yield ''.join(parts)
            parts = []
            size = 0


def embed_text(text):
    """Return the embedding vector for *text*, or ``None`` if the service
    answers with a ``code`` other than 200.

    Raises:
        requests.RequestException: on network errors or timeout.
        ValueError: if the response body is not valid JSON.
        KeyError, IndexError, TypeError: if the JSON lacks the expected fields.
    """
    response = requests.post(
        EMBEDDING_URL,
        json={"text": [text]},
        headers={"Content-Type": "application/json"},
        timeout=EMBEDDING_TIMEOUT_SECONDS,
    )
    res_json = json.loads(response.text)
    if res_json["code"] != 200:
        return None
    return res_json["data"][0]


def flush_rows(milvus_client, rows):
    """Insert *rows* into the collection; do nothing when *rows* is empty.

    Insert failures are reported and swallowed so that one bad batch does not
    abort the rest of the document (best-effort, as in the original script).
    """
    if not rows:
        return
    try:
        milvus_client.insert(collection_name=COLLECTION_NAME, data=rows)
    except Exception as e:  # boundary: client-specific exception types are not imported here
        print(f"insert of {len(rows)} rows failed: {e}")


def ingest_document(milvus_client, file_path):
    """Chunk one Word document, embed every chunk and store the rows."""
    doc = Document(file_path)
    # Read the paragraph list once: python-docx builds a fresh list on each
    # access of `doc.paragraphs`, so indexing it inside a loop is quadratic.
    paragraphs = doc.paragraphs
    texts = [paragraph.text for paragraph in trim_margins(paragraphs)]

    rows = []
    for chunk in iter_text_chunks(texts):
        try:
            vector = embed_text(chunk)
        except (requests.RequestException, ValueError, KeyError,
                IndexError, TypeError) as e:
            print(f"embedding failed, chunk skipped: {e}")
            continue
        if vector is None:
            # Service reported a non-200 code; nothing to store for this chunk.
            continue
        rows.append({
            'vector': vector,
            'text': chunk,
            'source': SOURCE_PATH,
        })
        if len(rows) >= INSERT_BATCH_SIZE:
            flush_rows(milvus_client, rows)
            rows = []
    # Always write the remainder so the last partial batch is never lost.
    flush_rows(milvus_client, rows)


def main():
    """Ingest every .docx file found directly inside `directory_path`."""
    client = MilvusClient(uri=MILVUS_URI)
    # Walk the directory in a stable order.
    for filename in sorted(os.listdir(directory_path)):
        # Skip non-Word entries (e.g. .DS_Store) and Word's "~$" lock files.
        if filename.startswith('~$') or not filename.lower().endswith('.docx'):
            continue
        # Build the full file path.
        file_path = os.path.join(directory_path, filename)
        print(file_path)
        try:
            ingest_document(client, file_path)
        except Exception as e:  # boundary: keep going with the next file
            print(f"failed to process {file_path}: {e}")


if __name__ == "__main__":
    main()