#coding=utf-8
"""Maintenance helpers for the measure configuration.

Loads measure definitions from a text file into MySQL, caches measure-name
embeddings in Redis, and prints Milvus search results for inspection.
"""
import sys
import ast
import re
from pdfminer.high_level import extract_text
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
import utils
import mysql.connector
from pymilvus import connections,MilvusClient
import json
import db_service
import numpy as np
import config
import redis_service
from config import MILVUS_CLIENT,MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB
import main
import redis

# Default location of the "measure,ori_measure" configuration file.
MEASURE_CONFIG_PATH = '/Users/zhengfei/work/zzb_data/measure_config_all.txt'

# Name of the Redis hash that maps ori_measure_id -> stringified embedding.
_MEASURE_VECTOR_HASH = 'measure_config'

# NOTE(review): the "." is unescaped, so it matches ANY character; every run
# of three or more digits therefore matches. Kept byte-for-byte because
# callers may rely on the current behavior -- confirm whether r"\." was meant.
_NUMERIC_PATTERN = re.compile(r"\d+(.\d+)+%?")

# A parenthesised group written with ASCII parentheses.
_PAREN_GROUP = re.compile(r"\([^)]*?\)")
# Keywords that mark a parenthesised group as meaningful.
_KEEP_KEYWORDS = re.compile(r"归属于|扣非")
# Anything that is neither a word character nor whitespace.
_PUNCTUATION = re.compile(r"[^\w\s]")


def measure_config_to_db(conn, cursor, config_path=MEASURE_CONFIG_PATH):
    """Insert every measure mapping from the config file into ``measure_config``.

    Each usable line has the form ``measure,ori_measure``. The ids stored
    alongside the names are ``utils.get_md5`` of the respective name. Rows are
    inserted unconditionally; no duplicate check is performed.

    Args:
        conn: Open database connection; committed after each inserted row.
        cursor: Cursor belonging to ``conn``.
        config_path: Path of the comma-separated configuration file.
    """
    insert_query = '''
        INSERT INTO measure_config
        (measure_id, measure_name, ori_measure_id, ori_measure_name)
        VALUES (%s, %s, %s, %s)
    '''
    with open(config_path, 'r') as file:
        for line in file:
            config_list = line.strip().split(',')
            # Blank or malformed lines have no second field; skip them
            # instead of failing with IndexError.
            if len(config_list) < 2:
                continue
            measure = config_list[0]
            ori_measure = config_list[1]
            ori_measure_id = utils.get_md5(ori_measure)
            data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure)
            cursor.execute(insert_query, data_to_insert)
            # Commit per row so rows inserted before a failure are kept.
            conn.commit()


def insert_measure_vector(conn, cursor):
    """Cache an embedding in Redis for every measure that does not have one.

    Reads all ``(ori_measure_id, ori_measure_name)`` rows from
    ``measure_config``. For each id missing from the Redis hash, the name is
    embedded with ``utils.embed_with_str`` and the embedding is stored as its
    ``str()`` representation.

    The Redis client and ``conn`` are both closed before returning, including
    when an error occurs part-way through.

    Args:
        conn: Open database connection; closed by this function.
        cursor: Cursor belonging to ``conn``.
    """
    # NOTE(review): host and password are hard-coded here; consider moving
    # them into the config module.
    redis_client = redis.Redis(host='192.168.0.172', port=6379, password='Xgf_redis', db=6)
    select_query = '''
        SELECT ori_measure_id,ori_measure_name FROM measure_config
    '''
    try:
        cursor.execute(select_query)
        for ori_measure_id, ori_measure_name in cursor.fetchall():
            if redis_client.hexists(_MEASURE_VECTOR_HASH, ori_measure_id):
                # Already cached; nothing to do for this measure.
                continue
            print('新增指标', ori_measure_name)
            vector_obj = utils.embed_with_str(ori_measure_name)
            measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])
            redis_client.hset(_MEASURE_VECTOR_HASH, ori_measure_id, measure_vector)
    finally:
        # Release both resources even if the query or embedding call failed.
        try:
            redis_client.close()
        finally:
            conn.close()


def contains_financial_indicators(text):
    """Return True if ``text`` contains a match for the numeric pattern.

    The pattern is one or more digits followed by one or more groups of
    "any single character + digits", with an optional trailing ``%``.
    """
    return bool(_NUMERIC_PATTERN.search(text))


def get_clean_text(text):
    """Clean parenthesised annotations in ``text``, print and return the result.

    For every ASCII-parenthesised group found in the input:

    * if the group mentions "归属于" or "扣非", all characters that are neither
      word characters nor whitespace are stripped from the whole text (which
      keeps the group's content but drops the parentheses);
    * otherwise that group is removed from the text.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (also printed to stdout).
    """
    for group in _PAREN_GROUP.findall(text):
        if _KEEP_KEYWORDS.search(group):
            # Keyword group: keep its content, strip punctuation everywhere.
            text = _PUNCTUATION.sub("", text)
        else:
            # Remove only this group so that keyword groups found later
            # are not wiped out along with it.
            text = text.replace(group, "")
    print(text)
    return text


def insert_and_update(conn, cursor, client, parent_table_pages, file_id, path):
    """Run the two ``db_service`` steps for one file, in order.

    First ``db_service.insert_table_measure_from_vector`` (measure lookup via
    vector search), then ``db_service.update_ori_measure`` (measure
    normalisation).
    """
    db_service.insert_table_measure_from_vector(conn, cursor, client, parent_table_pages, file_id, path)
    db_service.update_ori_measure(conn, cursor, file_id)


def print_measure_data(cursor, client):
    """Print the top Milvus hits for measures not yet present for file "64".

    Selects the ``measure_config`` rows whose ``measure_id`` does not appear in
    ``ori_measure_list`` for ``file_id='64'``. For each row, the vector
    returned by ``redis_service.read_from_redis`` is parsed and used to search
    the ``pdf_measure_v4`` collection (COSINE metric, at most 2 hits,
    restricted to ``file_id == "64"``).

    One line is printed per hit, colon-separated:
    ``measure_name:ori_measure_name:hit_measure_name:distance:measure_value:table_num:table_index``.

    Args:
        cursor: Database cursor used for the selection query.
        client: Milvus client exposing ``search``.
    """
    select_query = '''
        SELECT ori_measure_name,measure_name,ori_measure_id FROM measure_config
        where measure_id not in(select distinct measure_id from ori_measure_list where file_id='64')
    '''
    cursor.execute(select_query)
    for ori_measure_name, measure_name, ori_measure_id in cursor.fetchall():
        measure_vector = redis_service.read_from_redis(ori_measure_id)
        # The vector is stored as the str() of a list; parse it back safely.
        measure_list = ast.literal_eval(measure_vector)
        res = client.search(
            collection_name="pdf_measure_v4",
            data=[measure_list],
            limit=2,
            search_params={"metric_type": "COSINE", "params": {}},
            output_fields=["measure_name", "measure_value", "table_num", "table_index"],
            filter='file_id == "64"',
        )
        vector_str = measure_name + ":" + ori_measure_name
        for hit in res[0]:
            entity = hit["entity"]
            print(":".join([
                vector_str,
                entity["measure_name"],
                str(float(hit["distance"])),
                entity["measure_value"],
                str(entity["table_num"]),
                str(entity["table_index"]),
            ]))


if __name__ == "__main__":
    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB
    )
    cursor = conn.cursor()
    # insert_measure_vector closes the connection when it finishes.
    insert_measure_vector(conn, cursor)