pdf_code/zzb_data_word/parse_docx.py

57 lines
2.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from docx import Document
from pymilvus import MilvusClient
import requests
import json,time,os
# Connection to the Milvus server; the ingestion loop below inserts through it.
client = MilvusClient(uri='http://114.55.128.195:19530')

# Local folder whose entries are listed and opened as Word documents.
directory_path = '/Users/zhengfei/Desktop/大模型/书籍/第二批'
# Endpoint of the embedding service. It is sent {"text": [chunk]} and a
# successful answer carries "code" == 200 and the vector in "data"[0].
EMBEDDING_URL = "http://114.55.128.195:8001/get_embedding/"
# Seconds to wait for the embedding service before abandoning a request.
EMBEDDING_TIMEOUT = 60
# Milvus collection that receives the rows.
COLLECTION_NAME = "ydkf"
# Server-side directory recorded in the `source` field of every row.
# NOTE(review): assumes the knowledge-base copy of each book keeps the same
# file name as the local file -- confirm before running on a new batch.
SOURCE_DIR = '/projects/ai_chat/knowledge_base/ydkf/content/'
# Number of paragraphs ignored at the start and at the end of each document.
SKIP_PARAGRAPHS = 200
# A chunk is embedded as soon as its accumulated text is longer than this.
CHUNK_CHARS = 500
# Pending rows are inserted once there are more than this many of them.
BATCH_ROWS = 20


def _fetch_embedding(text):
    """Return the embedding vector for *text*, or None if the service rejects it.

    Network errors, timeouts and malformed responses propagate as exceptions
    so the caller can decide whether to retry.
    """
    response = requests.post(
        EMBEDDING_URL,
        json={"text": [text]},
        headers={"Content-Type": "application/json"},
        timeout=EMBEDDING_TIMEOUT,
    )
    payload = response.json()
    if payload["code"] != 200:
        print(f"embedding service returned code {payload['code']}; chunk skipped")
        return None
    return payload["data"][0]


# Walk every entry of the input directory and ingest it as a Word document.
# Entries that cannot be opened as a document are reported and skipped.
for filename in os.listdir(directory_path):
    try:
        file_path = os.path.join(directory_path, filename)
        print(file_path)
        # python-docx rebuilds the paragraph list on each access to
        # `.paragraphs`, so read it once instead of once per iteration.
        paragraphs = Document(file_path).paragraphs
        source = SOURCE_DIR + filename
        text = ''
        data = []
        # Only the middle of the document is used; a document with no more
        # than 2 * SKIP_PARAGRAPHS paragraphs contributes nothing.
        for num in range(SKIP_PARAGRAPHS, len(paragraphs) - SKIP_PARAGRAPHS):
            try:
                # Grow the current chunk paragraph by paragraph.
                text += paragraphs[num].text
                if len(text) > CHUNK_CHARS:
                    vector = _fetch_embedding(text)
                    if vector is not None:
                        data.append({
                            'vector': vector,
                            'text': text,
                            'source': source,
                        })
                    # Start a new chunk whether or not the service accepted
                    # this one. If the request itself raised, this line is
                    # not reached and the text is retried with the next
                    # paragraph appended.
                    text = ''
                if len(data) > BATCH_ROWS:
                    client.insert(collection_name=COLLECTION_NAME, data=data)
                    data = []
            except Exception as e:
                # Best effort: report the problem and move on to the next
                # paragraph; rows already collected stay pending.
                print(e)
        # Flush whatever is still pending. Doing this after the loop means a
        # failure on the last paragraph can no longer drop the final batch,
        # and an empty batch is never sent. Text still below the threshold
        # when the range ends is not embedded.
        if data:
            client.insert(collection_name=COLLECTION_NAME, data=data)
    except Exception as e:
        print(e)