# coding=utf-8
"""Maintenance script for measure configuration data.

Loads measure configuration rows from a text file into MySQL and stores
embedding vectors for original measure names in a Redis hash.
"""
import ast
import json
import sys
import time

import mysql.connector
import numpy as np
import redis

import config_p
import redis_service
import utils
from config_p import (
    MYSQL_DB,
    MYSQL_HOST,
    MYSQL_PASSWORD,
    MYSQL_USER,
    REDIS_HOST,
    REDIS_PASSWORD,
    REDIS_PORT,
)


def run_job(sec):
    """Block the calling thread for ``sec`` seconds."""
    time.sleep(sec)


def measure_config_to_db(conn, cursor, config_path='measure_config_all.txt', year='2024'):
    """Insert measure configuration rows from a text file into MySQL.

    Every non-blank line of the file is split on ``','``: field 0 is the
    measure name and field 1 is the original measure name. The ids stored
    next to them are ``utils.get_md5`` of the respective name. All rows go
    into ``measure_config_half_year`` and are committed once at the end.

    Args:
        conn: Open MySQL connection; used only for the final commit.
        cursor: Cursor created from ``conn``.
        config_path: Path of the UTF-8 text file to load.
        year: Value written to the ``year`` column of every row.

    Raises:
        IndexError: If a non-blank line contains no comma.
    """
    insert_query = '''
        INSERT INTO measure_config_half_year
        (measure_id, measure_name, ori_measure_id, ori_measure_name,year)
        VALUES (%s, %s, %s, %s, %s)
    '''
    # Read everything up front so the file is closed before any DB work.
    with open(config_path, 'r', encoding='utf-8') as file:
        lines = [line.strip() for line in file]

    for line in lines:
        if not line:
            # A blank line carries no configuration; skip it instead of
            # failing on the missing second field.
            continue
        config_list = line.split(',')
        measure = config_list[0]
        ori_measure = config_list[1]
        ori_measure_id = utils.get_md5(ori_measure)

        data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure, year)
        cursor.execute(insert_query, data_to_insert)
    conn.commit()


def insert_measure_vector(conn, cursor):
    """Store embedding vectors for measures missing from the Redis hash.

    Reads ``(ori_measure_id, ori_measure_name)`` rows from ``measure_config``
    for year 2023. For each id that is not yet a field of the Redis hash
    ``measure_config`` (db 6), the name is embedded with
    ``utils.embed_with_str`` and the stringified vector is written under
    that id. Ids that are already present are left untouched.

    The Redis client is always closed. ``conn`` is closed on successful
    completion, so the caller must not reuse it afterwards.

    Args:
        conn: Open MySQL connection; closed by this function on success.
        cursor: Cursor created from ``conn``.
    """
    # The connection is made without a password. REDIS_PASSWORD is available
    # from config_p if the server requires authentication.
    redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=6)
    try:
        # NOTE(review): the original code first assigned a query against
        # measure_config_half_year (year='2024') and immediately overwrote it
        # with the query below. The effective query is kept; confirm which
        # table/year is actually intended.
        select_query = '''
            SELECT ori_measure_id,ori_measure_name
            FROM measure_config
            where year='2023'
        '''
        cursor.execute(select_query)
        records = cursor.fetchall()
        for ori_measure_id, ori_measure_name in records:
            if redis_client.hexists('measure_config', ori_measure_id):
                # Already cached; nothing to compute or write.
                continue
            print('新增指标', ori_measure_name)
            vector_obj = utils.embed_with_str(ori_measure_name)
            measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])
            redis_client.hset('measure_config', ori_measure_id, measure_vector)
    finally:
        redis_client.close()
    conn.close()


if __name__ == "__main__":
    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB
    )
    try:
        cursor = conn.cursor()
        try:
            measure_config_to_db(conn, cursor)
            # Alternative job; note that it closes ``conn`` itself:
            # insert_measure_vector(conn, cursor)
        finally:
            cursor.close()
    finally:
        # Release the connection even when the job fails.
        conn.close()