261 lines
10 KiB
Python
261 lines
10 KiB
Python
#coding=utf-8
|
||
import sys,ast
|
||
# from pdfminer.high_level import extract_text
|
||
# from pdfminer.pdfparser import PDFParser
|
||
# from pdfminer.pdfdocument import PDFDocument
|
||
# from pdfminer.pdfpage import PDFPage
|
||
import utils
|
||
import mysql.connector
|
||
# from pymilvus import connections,MilvusClient
|
||
import json,time
|
||
# import db_service
|
||
import ast
|
||
import numpy as np
|
||
import config_p
|
||
import redis_service
|
||
from config_p import MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB,REDIS_HOST,REDIS_PORT,REDIS_PASSWORD
|
||
# import main
|
||
import redis
|
||
|
||
def run_job(sec):
    """Block the calling thread for ``sec`` seconds.

    Args:
        sec: Duration passed straight to ``time.sleep``.
    """
    time.sleep(sec)
|
||
|
||
def measure_config_to_db(conn,cursor,config_path='measure_config_all.txt',year='2024'):
    """Load measure mappings from a text file into ``measure_config_half_year``.

    Each non-blank line of the file is split on commas; field 0 is used as the
    measure name and field 1 as the original measure name. Both ids are
    derived from the names with ``utils.get_md5``. One row is inserted per
    line and a single commit is issued after all rows have been executed.

    Args:
        conn: Database connection; only ``commit()`` is called on it.
        cursor: Cursor used to execute the parameterized INSERT.
        config_path: Path of the UTF-8 mapping file to read.
        year: Value written to the ``year`` column of every inserted row.

    Raises:
        IndexError: If a non-blank line has fewer than two comma-separated
            fields.
    """
    insert_query = '''
        INSERT INTO measure_config_half_year
        (measure_id, measure_name, ori_measure_id, ori_measure_name,year)
        VALUES (%s, %s, %s, %s, %s)
    '''
    with open(config_path, 'r', encoding='utf-8') as file:
        # Stream the file line by line instead of loading it all at once.
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no mapping and
                # would otherwise fail on the second-field lookup below.
                continue
            config_list = stripped.split(',')
            measure = config_list[0]
            ori_measure = config_list[1]
            ori_measure_id = utils.get_md5(ori_measure)

            data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure, year)
            cursor.execute(insert_query, data_to_insert)
    conn.commit()
|
||
|
||
def insert_measure_vector(conn,cursor):
    """Cache an embedding vector in Redis for every measure not yet cached.

    Selects ``(ori_measure_id, ori_measure_name)`` rows from ``measure_config``
    for year '2023'. For each id that is missing from the Redis hash
    ``measure_config`` (db 6), the name is embedded with
    ``utils.embed_with_str`` and the stringified vector is stored under that
    id. Ids already present in the hash are left untouched.

    Args:
        conn: Database connection; it is closed before this function returns,
            whether or not an error occurred.
        cursor: Cursor used to run the SELECT.
    """
    # Only this query was ever executed; an earlier assignment targeting
    # measure_config_half_year / year='2024' was overwritten before use.
    select_query = '''
        SELECT ori_measure_id,ori_measure_name FROM measure_config where year='2023'
        '''
    # NOTE(review): REDIS_PASSWORD is imported by this file but is not passed
    # here, matching the original call -- confirm the server needs no auth.
    redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=6)
    try:
        cursor.execute(select_query)
        for measure_id, measure_name in cursor.fetchall():
            if redis_client.hexists('measure_config', measure_id):
                # Already cached: re-reading and re-writing the same value
                # would only cost two extra round trips.
                continue
            print('新增指标', measure_name)
            vector_obj = utils.embed_with_str(measure_name)
            measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])
            redis_client.hset('measure_config', measure_id, measure_vector)
    finally:
        # Release both connections even if embedding or Redis fails midway.
        try:
            redis_client.close()
        finally:
            conn.close()
|
||
|
||
# def contains_financial_indicators(text):
|
||
# import re
|
||
# # 正则表达式模式匹配千分位格式的数字和百分比
|
||
# pattern = r"\d{1,3}(,\d{3})+(\.\d{1,3})?"
|
||
|
||
# pattern1 = r"\d+(.\d+)+%?"
|
||
# # 使用 re.search 函数查找匹配项
|
||
# match = re.search(pattern1, text)
|
||
|
||
# # 如果找到匹配项,返回 True,否则返回 False
|
||
# return bool(match)
|
||
|
||
# def get_clean_text(text):
|
||
# import re
|
||
# pattern = r"\([^)]*?\)"
|
||
# matches = re.findall(pattern, text)
|
||
# for match in matches:
|
||
# # 使用 re.findall 函数查找括号内的内容中是否包含月份或关键词
|
||
# month_keywords_found = re.search(r"归属于|扣非", match)
|
||
# if not month_keywords_found:
|
||
# # 如果包含,则从文本中删除该部分
|
||
# text = re.sub(pattern,"", text)
|
||
# else:
|
||
# # 如果不包含,删除所有标点符号和中文数字
|
||
# text = re.sub(r"[^\w\s]", "", text)
|
||
# print(text)
|
||
|
||
# def insert_and_update(conn,cursor,client,parent_table_pages,file_id,path):
|
||
# # #通过向量查询指标
|
||
# db_service.insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_id,path)
|
||
|
||
# # #指标归一化处理
|
||
# db_service.update_ori_measure(conn,cursor,file_id)
|
||
|
||
# def print_measure_data(cursor,client):
|
||
# select_query = '''
|
||
# SELECT ori_measure_name,measure_name,ori_measure_id FROM measure_config
|
||
# where measure_id not in(select distinct measure_id from ori_measure_list where file_id='64')
|
||
# '''
|
||
# cursor.execute(select_query)
|
||
# records = cursor.fetchall()
|
||
# for record in records:
|
||
# ori_measure_name = record[0]
|
||
# measure_name = record[1]
|
||
# ori_measure_id = record[2]
|
||
# measure_vector = redis_service.read_from_redis(ori_measure_id)
|
||
|
||
# measure_list = ast.literal_eval(measure_vector)
|
||
# data = [measure_list]
|
||
# res = client.search(
|
||
# collection_name="pdf_measure_v4", # Replace with the actual name of your collection
|
||
# # Replace with your query vector
|
||
# data=data,
|
||
# limit=2, # Max. number of search results to return
|
||
# search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
|
||
# output_fields=["measure_name","measure_value","table_num","table_index"],
|
||
# filter = 'file_id == "64"'
|
||
# )
|
||
# vector_str = measure_name+":"+ori_measure_name
|
||
# # Convert the output to a formatted JSON string
|
||
# for i in range(len(res[0])):
|
||
|
||
# vector_distance = float(res[0][i]["distance"])
|
||
# vector_measure_name = res[0][i]["entity"]["measure_name"]
|
||
# measure_value = res[0][i]["entity"]["measure_value"]
|
||
# table_num = res[0][i]["entity"]["table_num"]
|
||
# table_index = res[0][i]["entity"]["table_index"]
|
||
# table_num_list = [106]
|
||
# print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index))
|
||
# # if vector_distance > 0.89 and table_num not in table_num_list:
|
||
# # print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(0.94))
|
||
# # if vector_distance > distance and table_num not in table_num_list:
|
||
# # print(vector_str +":"+vector_measure_name +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(vector_distance)+":"+str(distance))
|
||
|
||
|
||
if __name__ == "__main__":
    # Script entry point: connect to MySQL with the settings imported from
    # config_p and populate the Redis vector cache for the configured measures.
    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB
    )
    cursor = conn.cursor()
    try:
        # One-off loader for the mapping file; left disabled as before.
        # measure_config_to_db(conn,cursor)

        insert_measure_vector(conn,cursor)
    finally:
        cursor.close()
        # insert_measure_vector may already have closed the connection, so
        # only close it here if it is still open.
        if conn.is_connected():
            conn.close()
|
||
|
||
# cursor.close()
|
||
# conn.close()
|
||
# import re
|
||
# text = '减少11.04百分点'
|
||
# if re.match(r'(增加|减少)[了]?(\d+\.\d+)[个]?百分点', text):
|
||
# print('找到了单位。')
|
||
|
||
# unit_pattern = re.compile(r'(增加|减少)[了]?(\d+\.\d+)[个]?百分点')
|
||
|
||
# match = unit_pattern.search(text)
|
||
# print(len(match.groups()))
|
||
|
||
# if match:
|
||
# print(f'找到单位。')
|
||
# else:
|
||
# print(f'没有找到单位。')
|
||
# row1 = ['比例','比率','占比','费用']
|
||
# row2 = ['同比增减','同比上升','同比下降','变化幅度','变动比例','本期比上年同期增减','本年比上年增减','同比变动','本期期末金额较上期期末变动比例']
|
||
|
||
# for i in range(len(row1)):
|
||
# for j in range(len(row2)):
|
||
# print(f"{row1[i]}{row2[j]}")
|
||
# import os,re
|
||
# file_path = '/projects/ai_chat/knowledge_base/ydkf/content/体育运动处方及应用_13925781.docx'
|
||
|
||
# # 获取文件名和扩展名
|
||
# file_base_name, file_extension = os.path.splitext(os.path.basename(file_path))
|
||
# file_base_name = file_base_name.replace("_", "").replace("\d+", "")
|
||
# file_base_name = re.sub(r'\d+', '', file_base_name)
|
||
# print(f'文件名: {file_base_name}')
|
||
# import re
|
||
# print(len(re.findall('母公司|现金流量表补充', '补充资料')))
|
||
# import threading
|
||
|
||
# # 创建一个ThreadLocal变量
|
||
# local_data = threading.local()
|
||
|
||
# # 定义一个线程执行的工作函数
|
||
# def worker():
|
||
# # 为当前线程的ThreadLocal变量设置一个值
|
||
# local_data.data = f"Thread {threading.current_thread().name}'s data"
|
||
# print(local_data.data)
|
||
|
||
# # 创建并启动多个线程
|
||
# threads = []
|
||
# for i in range(3):
|
||
# thread = threading.Thread(target=worker)
|
||
# thread.start()
|
||
# threads.append(thread)
|
||
|
||
# # 等待所有线程完成
|
||
# for thread in threads:
|
||
# thread.join()
|
||
# for i in range(2,5):
|
||
# print(i)
|
||
# file_url = 'http://static.cninfo.com.cn/finalpage/2023-04-11/1216368607.PDF'
|
||
# file_path = utils.save_pdf_from_url(file_url, config.FILE_PATH)
|
||
# redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
|
||
# print(redis_client.hget('measure_config', '2805fd5b7bfa960eb08312fa3d7c08'))
|
||
# client = MilvusClient(
|
||
# uri= MILVUS_CLIENT
|
||
# )
|
||
# conn = mysql.connector.connect(
|
||
# host=MYSQL_HOST,
|
||
# user=MYSQL_USER,
|
||
# password=MYSQL_PASSWORD,
|
||
# database=MYSQL_DB
|
||
# )
|
||
# cursor = conn.cursor()
|
||
# print_measure_data(cursor,client)
|
||
# redis_service.read_from_file_and_write_to_redis(conn,cursor)
|
||
# redis_service.read_from_redis()
|
||
# parent_table_pages = []
|
||
# file_id = '67'
|
||
# path = '/Users/zhengfei/Desktop/上汽车配/上汽车配_1.pdf'
|
||
|
||
# db_service.insert_table_measure_from_vector_test(conn,cursor,client,parent_table_pages,file_id,path)
|
||
|
||
# db_service.update_ori_measure(conn,cursor,file_id)
|
||
|
||
# main.get_table_measure(path,'all',file_id)
|
||
|
||
# insert_and_update(conn,cursor,client,parent_table_pages,file_id,path)
|
||
|
||
|
||
# measure_config_to_db(conn,cursor)
|
||
# params = ['f_102','f_103',]
|
||
# for param in params:
|
||
# globals()[param] = param.replace('f_','')
|
||
# # insert_measure_vector(conn,cursor)
|
||
# print(globals()['f_102'])
|
||
# db_service.update_ori_measure(conn,cursor,file_id)
|
||
|
||
# conn.commit()
|
||
# cursor.close()
|
||
# conn.close()
|
||
# # print(utils.get_md5('当期营业收入,2023年营业收入'))
|
||
# count_range_parts = utils.get_range(2300)
|
||
|
||
# print(count_range_parts)
|