pdf_code/zzb_data_word/config_init.py

261 lines
10 KiB
Python
Raw Normal View History

2024-12-30 17:51:12 +08:00
#coding=utf-8
import sys,ast
# from pdfminer.high_level import extract_text
# from pdfminer.pdfparser import PDFParser
# from pdfminer.pdfdocument import PDFDocument
# from pdfminer.pdfpage import PDFPage
import utils
import mysql.connector
# from pymilvus import connections,MilvusClient
import json,time
# import db_service
import ast
import numpy as np
import config_p
import redis_service
from config_p import MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB,REDIS_HOST,REDIS_PORT,REDIS_PASSWORD
# import main
import redis
def run_job(sec):
    """Pause the calling thread.

    :param sec: number of seconds to wait; forwarded unchanged to ``time.sleep``.
    """
    delay = sec
    time.sleep(delay)
def measure_config_to_db(conn,cursor):
    """Load measure definitions from ``measure_config_all.txt`` into MySQL.

    Every non-blank line of the file is split on ``,``: field 0 is the measure
    name and field 1 is the original measure name. For each line one row is
    inserted into ``measure_config_half_year`` with ids computed by
    ``utils.get_md5`` and the fixed year ``'2024'``.

    Blank lines (e.g. a trailing newline at the end of the file) are skipped
    instead of raising ``IndexError``. The whole file is parsed before the
    first INSERT is issued, and the transaction is committed once after all
    rows have been executed, so a malformed line does not leave a partial load.

    The file is opened relative to the current working directory.

    :param conn: open database connection; only ``commit()`` is called on it.
    :param cursor: cursor belonging to ``conn`` used to run the INSERTs.
    :raises IndexError: if a non-blank line has fewer than two comma-separated fields.
    :raises OSError: if ``measure_config_all.txt`` cannot be opened.
    """
    insert_query = '''
        INSERT INTO measure_config_half_year
        (measure_id, measure_name, ori_measure_id, ori_measure_name,year)
        VALUES (%s, %s, %s, %s, %s)
    '''

    rows = []
    with open('measure_config_all.txt', 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Nothing to parse on an empty line.
                continue
            config_list = stripped.split(',')
            measure = config_list[0]
            ori_measure = config_list[1]
            rows.append((
                utils.get_md5(measure),
                measure,
                utils.get_md5(ori_measure),
                ori_measure,
                '2024',
            ))

    for data_to_insert in rows:
        cursor.execute(insert_query, data_to_insert)
    conn.commit()
def insert_measure_vector(conn,cursor):
    """Make sure every configured measure has an embedding cached in Redis.

    Reads ``(ori_measure_id, ori_measure_name)`` pairs from ``measure_config``
    for year ``'2023'``. For each id that is not yet a field of the Redis hash
    ``measure_config`` (db 6), the name is embedded with
    ``utils.embed_with_str`` and ``str()`` of the returned embedding is stored
    under that id. Ids that already exist in the hash are left untouched.

    The Redis client and ``conn`` are both closed before returning, including
    when the query, the embedding call or a Redis command raises.

    :param conn: open database connection; it is closed by this function.
    :param cursor: cursor belonging to ``conn`` used to run the SELECT.
    """
    # Password-authenticated variant, currently disabled:
    # redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
    redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=6)

    # NOTE(review): the previous version first assigned a query against
    # measure_config_half_year (year='2024') and immediately overwrote it with
    # the one below, so only this query ever ran. Confirm which table/year is
    # actually intended.
    select_query = '''
        SELECT ori_measure_id,ori_measure_name FROM measure_config where year='2023'
    '''

    try:
        cursor.execute(select_query)
        for ori_measure_id, ori_measure_name in cursor.fetchall():
            if redis_client.hexists('measure_config', ori_measure_id):
                # Already cached; the stored value is not needed here.
                continue
            print('新增指标', ori_measure_name)
            vector_obj = utils.embed_with_str(ori_measure_name)
            measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])
            redis_client.hset('measure_config', ori_measure_id, measure_vector)
    finally:
        # Release both connections even if a step above failed.
        redis_client.close()
        conn.close()
# def contains_financial_indicators(text):
# import re
# # 正则表达式模式匹配千分位格式的数字和百分比
# pattern = r"\d{1,3}(,\d{3})+(\.\d{1,3})?"
# pattern1 = r"\d+(.\d+)+%?"
# # 使用 re.search 函数查找匹配项
# match = re.search(pattern1, text)
# # 如果找到匹配项,返回 True否则返回 False
# return bool(match)
# def get_clean_text(text):
# import re
# pattern = r"\[^)]*?\"
# matches = re.findall(pattern, text)
# for match in matches:
# # 使用 re.findall 函数查找括号内的内容中是否包含月份或关键词
# month_keywords_found = re.search(r"归属于|扣非", match)
# if not month_keywords_found:
# # 如果包含,则从文本中删除该部分
# text = re.sub(pattern,"", text)
# else:
# # 如果不包含,删除所有标点符号和中文数字
# text = re.sub(r"[^\w\s]", "", text)
# print(text)
# def insert_and_update(conn,cursor,client,parent_table_pages,file_id,path):
# # #通过向量查询指标
# db_service.insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_id,path)
# # #指标归一化处理
# db_service.update_ori_measure(conn,cursor,file_id)
# def print_measure_data(cursor,client):
# select_query = '''
# SELECT ori_measure_name,measure_name,ori_measure_id FROM measure_config
# where measure_id not in(select distinct measure_id from ori_measure_list where file_id='64')
# '''
# cursor.execute(select_query)
# records = cursor.fetchall()
# for record in records:
# ori_measure_name = record[0]
# measure_name = record[1]
# ori_measure_id = record[2]
# measure_vector = redis_service.read_from_redis(ori_measure_id)
# measure_list = ast.literal_eval(measure_vector)
# data = [measure_list]
# res = client.search(
# collection_name="pdf_measure_v4", # Replace with the actual name of your collection
# # Replace with your query vector
# data=data,
# limit=2, # Max. number of search results to return
# search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
# output_fields=["measure_name","measure_value","table_num","table_index"],
# filter = 'file_id == "64"'
# )
# vector_str = measure_name+":"+ori_measure_name
# # Convert the output to a formatted JSON string
# for i in range(len(res[0])):
# vector_distance = float(res[0][i]["distance"])
# vector_measure_name = res[0][i]["entity"]["measure_name"]
# measure_value = res[0][i]["entity"]["measure_value"]
# table_num = res[0][i]["entity"]["table_num"]
# table_index = res[0][i]["entity"]["table_index"]
# table_num_list = [106]
# print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index))
# # if vector_distance > 0.89 and table_num not in table_num_list:
# # print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(0.94))
# # if vector_distance > distance and table_num not in table_num_list:
# # print(vector_str +":"+vector_measure_name +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(vector_distance)+":"+str(distance))
if __name__ == "__main__":
    # Debug aid (disabled): read one cached vector straight from Redis.
    # redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
    # vector = redis_service.read_from_redis(redis_client,'893301b0e4f1e07d16b4830fcdaea28a')
    # print(vector)

    # Connection settings come from config_p.
    db_settings = {
        "host": MYSQL_HOST,
        "user": MYSQL_USER,
        "password": MYSQL_PASSWORD,
        "database": MYSQL_DB,
    }
    conn = mysql.connector.connect(**db_settings)
    cursor = conn.cursor()

    # Alternative job (disabled): load the measure definitions file into MySQL.
    # measure_config_to_db(conn,cursor)

    # insert_measure_vector closes conn itself, so there is no explicit
    # cursor.close() / conn.close() afterwards.
    insert_measure_vector(conn, cursor)
    # cursor.close()
    # conn.close()
# import re
# text = '减少11.04百分点'
# if re.match(r'(增加|减少)[了]?(\d+\.\d+)[个]?百分点', text):
# print('找到了单位。')
# unit_pattern = re.compile(r'(增加|减少)[了]?(\d+\.\d+)[个]?百分点')
# match = unit_pattern.search(text)
# print(len(match.groups()))
# if match:
# print(f'找到单位。')
# else:
# print(f'没有找到单位。')
# row1 = ['比例','比率','占比','费用']
# row2 = ['同比增减','同比上升','同比下降','变化幅度','变动比例','本期比上年同期增减','本年比上年增减','同比变动','本期期末金额较上期期末变动比例']
# for i in range(len(row1)):
# for j in range(len(row2)):
# print(f"{row1[i]}{row2[j]}")
# import os,re
# file_path = '/projects/ai_chat/knowledge_base/ydkf/content/体育运动处方及应用_13925781.docx'
# # 获取文件名和扩展名
# file_base_name, file_extension = os.path.splitext(os.path.basename(file_path))
# file_base_name = file_base_name.replace("_", "").replace("\d+", "")
# file_base_name = re.sub(r'\d+', '', file_base_name)
# print(f'文件名: {file_base_name}')
# import re
# print(len(re.findall('母公司|现金流量表补充', '补充资料')))
# import threading
# # 创建一个ThreadLocal变量
# local_data = threading.local()
# # 定义一个线程执行的工作函数
# def worker():
# # 为当前线程的ThreadLocal变量设置一个值
# local_data.data = f"Thread {threading.current_thread().name}'s data"
# print(local_data.data)
# # 创建并启动多个线程
# threads = []
# for i in range(3):
# thread = threading.Thread(target=worker)
# thread.start()
# threads.append(thread)
# # 等待所有线程完成
# for thread in threads:
# thread.join()
# for i in range(2,5):
# print(i)
# file_url = 'http://static.cninfo.com.cn/finalpage/2023-04-11/1216368607.PDF'
# file_path = utils.save_pdf_from_url(file_url, config.FILE_PATH)
# redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
# print(redis_client.hget('measure_config', '2805fd5b7bfa960eb08312fa3d7c08'))
# client = MilvusClient(
# uri= MILVUS_CLIENT
# )
# conn = mysql.connector.connect(
# host=MYSQL_HOST,
# user=MYSQL_USER,
# password=MYSQL_PASSWORD,
# database=MYSQL_DB
# )
# cursor = conn.cursor()
# print_measure_data(cursor,client)
# redis_service.read_from_file_and_write_to_redis(conn,cursor)
# redis_service.read_from_redis()
# parent_table_pages = []
# file_id = '67'
# path = '/Users/zhengfei/Desktop/上汽车配/上汽车配_1.pdf'
# db_service.insert_table_measure_from_vector_test(conn,cursor,client,parent_table_pages,file_id,path)
# db_service.update_ori_measure(conn,cursor,file_id)
# main.get_table_measure(path,'all',file_id)
# insert_and_update(conn,cursor,client,parent_table_pages,file_id,path)
# measure_config_to_db(conn,cursor)
# params = ['f_102','f_103',]
# for param in params:
# globals()[param] = param.replace('f_','')
# # insert_measure_vector(conn,cursor)
# print(globals()['f_102'])
# db_service.update_ori_measure(conn,cursor,file_id)
# conn.commit()
# cursor.close()
# conn.close()
# # print(utils.get_md5('当期营业收入,2023年营业收入'))
# count_range_parts = utils.get_range(2300)
# print(count_range_parts)