# pdf_code/zzb_data_word/parse_docx.py

from docx import Document
from pymilvus import MilvusClient
import requests
import json,time,os

# Local folder whose entries are opened as Word documents and ingested.
directory_path = '/Users/zhengfei/Desktop/大模型/书籍/第二批'

# HTTP endpoint that returns an embedding vector for a piece of text.
EMBEDDING_URL = "http://114.55.128.195:8001/get_embedding/"
# Milvus collection that receives the rows.
COLLECTION_NAME = "ydkf"
# NOTE(review): this path names one specific book, yet it is stored as the
# `source` of every row from every file in `directory_path` -- confirm
# whether it should be derived from the file being processed instead.
SOURCE_LABEL = '/projects/ai_chat/knowledge_base/ydkf/content/骨盆和骶骼关节功能解剖 手法操作指南 详解局部解剖和功能 涵盖评估分析 运动 肌肉能量技术及替代_14533413.docx'
# Number of paragraphs ignored at the start and at the end of each document.
SKIP_PARAGRAPHS = 200
# A chunk is emitted once the accumulated text is longer than this.
CHUNK_MIN_CHARS = 500
# Buffered rows are inserted once there are more than this many.
BATCH_THRESHOLD = 20
# Seconds to wait for the embedding service before giving up on a request.
REQUEST_TIMEOUT = 60

client = MilvusClient(
    uri='http://114.55.128.195:19530'
)


def _embed(text):
    """Return the embedding vector for *text*, or None if the service refuses.

    Network and decoding errors propagate to the caller.
    """
    response = requests.post(
        EMBEDDING_URL,
        json={"text": [text]},
        headers={"Content-Type": "application/json"},
        timeout=REQUEST_TIMEOUT,
    )
    payload = response.json()
    if payload["code"] != 200:
        # Never fall through with a missing or stale vector.
        print(f"embedding request failed with code {payload['code']}")
        return None
    return payload["data"][0]


def _flush(rows):
    """Insert *rows* into Milvus.

    Returns an empty list on success, or *rows* unchanged on failure so the
    caller can try again later.
    """
    try:
        client.insert(collection_name=COLLECTION_NAME, data=rows)
    except Exception as e:
        print(e)
        return rows
    return []


def _ingest_document(file_path):
    """Chunk one Word document, embed every chunk and store it in Milvus.

    The first and last SKIP_PARAGRAPHS paragraphs are ignored. Paragraph texts
    are concatenated until the buffer exceeds CHUNK_MIN_CHARS characters; the
    buffer then becomes one row. Text left over at the end that never reaches
    that length is discarded.
    """
    # Read the paragraph list once instead of re-reading it for every index.
    paragraphs = Document(file_path).paragraphs
    stop = max(len(paragraphs) - SKIP_PARAGRAPHS, SKIP_PARAGRAPHS)

    rows = []
    pending = ''
    for paragraph in paragraphs[SKIP_PARAGRAPHS:stop]:
        try:
            pending += paragraph.text
            if len(pending) > CHUNK_MIN_CHARS:
                vector = _embed(pending)
                if vector is not None:
                    rows.append({
                        'vector': vector,
                        'text': pending,
                        'source': SOURCE_LABEL,
                    })
                    pending = ''
                # On failure the text stays buffered and is retried together
                # with the next paragraph.
        except Exception as e:
            # Best effort: one bad paragraph or request must not stop the file.
            print(e)

        if len(rows) > BATCH_THRESHOLD:
            rows = _flush(rows)

    # Store whatever is still buffered, even if the last paragraph failed.
    if rows:
        _flush(rows)


# Walk the directory and ingest every entry; a file that cannot be processed
# is reported and skipped.
for filename in os.listdir(directory_path):
    file_path = os.path.join(directory_path, filename)
    print(file_path)
    try:
        _ingest_document(file_path)
    except Exception as e:
        print(e)