"""Chunk Word documents, embed each chunk, and insert the rows into Milvus.

For every file in ``directory_path`` the script opens it with python-docx,
concatenates paragraph text until a chunk is longer than ``MIN_CHUNK_CHARS``,
posts the chunk to the embedding service, and inserts the resulting rows into
the ``COLLECTION_NAME`` collection in batches.

The work runs at module level (no ``__main__`` guard), so importing or
executing this file performs the ingestion.
"""
import json
import os
import time

import requests
from docx import Document
from pymilvus import MilvusClient

directory_path = '/Users/zhengfei/Desktop/大模型/书籍/第二批'

EMBEDDING_URL = "http://114.55.128.195:8001/get_embedding/"
COLLECTION_NAME = "ydkf"

# NOTE(review): every row, whatever file it came from, is tagged with this one
# hard-coded document path. That looks like a leftover from a single-book run;
# confirm whether ``source`` should instead be derived from the current file.
SOURCE_PATH = '/projects/ai_chat/knowledge_base/ydkf/content/骨盆和骶骼关节功能解剖 手法操作指南 详解局部解剖和功能 涵盖评估分析 运动 肌肉能量技术及替代_14533413.docx'

# Paragraphs ignored at both the start and the end of every document
# (presumably front/back matter -- TODO confirm).
SKIP_PARAGRAPHS = 200
# A chunk is emitted once the accumulated text is longer than this.
MIN_CHUNK_CHARS = 500
# Rows per insert call; the original flushed once its counter exceeded 20.
BATCH_SIZE = 21
# Seconds to wait for the embedding service before giving up on a request.
REQUEST_TIMEOUT = 60

client = MilvusClient(
    uri='http://114.55.128.195:19530'
)


def _embed(text):
    """Return the embedding vector for *text*, or ``None`` on a non-200 code.

    Network errors and malformed responses propagate as exceptions so the
    caller can decide how to continue.
    """
    response = requests.post(
        EMBEDDING_URL,
        json={"text": [text]},
        headers={"Content-Type": "application/json"},
        # Without a timeout a stalled service would block the run forever.
        timeout=REQUEST_TIMEOUT,
    )
    res_json = json.loads(response.text)
    if res_json["code"] != 200:
        print(f"embedding service returned code {res_json['code']}; chunk skipped")
        return None
    return res_json["data"][0]


def _make_row(vector, text):
    """Build the record inserted into Milvus for one chunk."""
    return {'vector': vector, 'text': text, 'source': SOURCE_PATH}


def _ingest_document(file_path):
    """Chunk, embed, and insert the body paragraphs of one Word document.

    Raises whatever ``Document(file_path)`` raises when the file cannot be
    opened. Failures while embedding or inserting are printed and processing
    continues with the next paragraph (best effort).
    """
    doc = Document(file_path)
    # Read the paragraph list once instead of re-evaluating ``doc.paragraphs``
    # on every loop iteration.
    paragraphs = doc.paragraphs
    stop = len(paragraphs) - SKIP_PARAGRAPHS
    body = paragraphs[SKIP_PARAGRAPHS:max(SKIP_PARAGRAPHS, stop)]
    if not body:
        print(f"{file_path}: {len(paragraphs)} paragraphs, nothing left after "
              f"skipping {SKIP_PARAGRAPHS} at each end")
        return

    batch = []
    text = ''
    for paragraph in body:
        try:
            text += paragraph.text
            if len(text) <= MIN_CHUNK_CHARS:
                continue
            vector = _embed(text)
            if vector is not None:
                batch.append(_make_row(vector, text))
            # Reached only when the request itself did not raise; after an
            # exception the text is kept and retried with the next paragraph.
            text = ''
            if len(batch) >= BATCH_SIZE:
                client.insert(collection_name=COLLECTION_NAME, data=batch)
                batch = []
        except Exception as e:
            print(f"{file_path}: {e}")

    # The text left over at the end of the document is usually shorter than
    # the threshold; embed it too rather than dropping it.
    if text.strip():
        try:
            vector = _embed(text)
            if vector is not None:
                batch.append(_make_row(vector, text))
        except Exception as e:
            print(f"{file_path}: {e}")

    # Final flush, skipped when nothing is pending so Milvus is never asked
    # to insert an empty list.
    if batch:
        try:
            client.insert(collection_name=COLLECTION_NAME, data=batch)
        except Exception as e:
            print(f"{file_path}: {e}")


# Walk the directory; a file that cannot be processed is reported and skipped.
for filename in os.listdir(directory_path):
    file_path = os.path.join(directory_path, filename)
    print(file_path)
    try:
        _ingest_document(file_path)
    except Exception as e:
        print(e)