pdf_code/zzb_data_word/redis_init.py

261 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#coding=utf-8
import sys,ast
# from pdfminer.high_level import extract_text
# from pdfminer.pdfparser import PDFParser
# from pdfminer.pdfdocument import PDFDocument
# from pdfminer.pdfpage import PDFPage
import utils
import mysql.connector
# from pymilvus import connections,MilvusClient
import json,time
# import db_service
import ast
import numpy as np
import config_p
import redis_service
from config_p import MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB,REDIS_HOST,REDIS_PORT,REDIS_PASSWORD
# import main
import redis
def run_job(sec):
    """Block the calling thread for ``sec`` seconds.

    Args:
        sec: Delay in seconds, handed unchanged to ``time.sleep``.
    """
    time.sleep(sec)
def measure_config_to_db(conn, cursor, *, path='measure_config_all.txt', year='2024'):
    """Load measure mappings from a text file into ``measure_config_half_year``.

    Every non-blank line of the file is split on ``,``: field 0 is the
    measure name and field 1 is the original measure name.  Both names are
    hashed with ``utils.get_md5`` to produce their ids, and one row per line
    is inserted together with ``year``.

    Args:
        conn: Open database connection; ``commit()`` is called once after
            all rows have been inserted.
        cursor: Cursor used to execute the INSERT statements.
        path: UTF-8 text file to read.  Defaults to the previously
            hard-coded ``measure_config_all.txt``.
        year: Value written to the ``year`` column.  Defaults to the
            previously hard-coded ``'2024'``.

    Raises:
        IndexError: If a non-blank line has fewer than two comma-separated
            fields.
    """
    insert_query = '''
    INSERT INTO measure_config_half_year
    (measure_id, measure_name, ori_measure_id, ori_measure_name,year)
    VALUES (%s, %s, %s, %s, %s)
    '''
    with open(path, 'r', encoding='utf-8') as file:
        # Stream the file line by line instead of loading it all at once.
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at the end of the
                # file) carry no mapping and used to crash on the missing
                # second field.
                continue
            config_list = stripped.split(',')
            measure = config_list[0]
            ori_measure = config_list[1]
            ori_measure_id = utils.get_md5(ori_measure)
            data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure, year)
            cursor.execute(insert_query, data_to_insert)
    # One commit for the whole file, so a failure part-way through does not
    # leave a half-imported config behind.
    conn.commit()
def insert_measure_vector(conn, cursor):
    """Cache an embedding in Redis for every 2023 ``measure_config`` row.

    Selects ``(ori_measure_id, ori_measure_name)`` from ``measure_config``
    for year 2023.  For each id that is not yet a field of the Redis hash
    ``measure_config`` (db 6), the name is embedded with
    ``utils.embed_with_str`` and the stringified vector is stored under
    that id.  Ids that already exist in the hash are left untouched.

    Args:
        conn: Open database connection.  It is closed before this function
            returns, including when an error is raised.
        cursor: Cursor of ``conn`` used to run the SELECT.
    """
    # Password auth is currently not used; add password=REDIS_PASSWORD here
    # if the Redis server requires it.
    redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=6)
    # The half-year variant of this job reads from measure_config_half_year
    # with year='2024' instead; only the query below is executed.
    select_query = '''
    SELECT ori_measure_id,ori_measure_name FROM measure_config where year='2023'
    '''
    try:
        cursor.execute(select_query)
        for measure_id, measure_name in cursor.fetchall():
            if redis_client.hexists('measure_config', measure_id):
                # Already cached: nothing to compute or write.
                continue
            print('新增指标', measure_name)
            vector_obj = utils.embed_with_str(measure_name)
            measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])
            redis_client.hset('measure_config', measure_id, measure_vector)
    finally:
        # Release both connections even if the query or an embedding call
        # fails part-way through.
        redis_client.close()
        conn.close()
# def contains_financial_indicators(text):
# import re
# # 正则表达式模式匹配千分位格式的数字和百分比
# pattern = r"\d{1,3}(,\d{3})+(\.\d{1,3})?"
# pattern1 = r"\d+(.\d+)+%?"
# # 使用 re.search 函数查找匹配项
# match = re.search(pattern1, text)
# # 如果找到匹配项,返回 True否则返回 False
# return bool(match)
# def get_clean_text(text):
# import re
# pattern = r"\[^)]*?\"
# matches = re.findall(pattern, text)
# for match in matches:
# # 使用 re.findall 函数查找括号内的内容中是否包含月份或关键词
# month_keywords_found = re.search(r"归属于|扣非", match)
# if not month_keywords_found:
# # 如果包含,则从文本中删除该部分
# text = re.sub(pattern,"", text)
# else:
# # 如果不包含,删除所有标点符号和中文数字
# text = re.sub(r"[^\w\s]", "", text)
# print(text)
# def insert_and_update(conn,cursor,client,parent_table_pages,file_id,path):
# # #通过向量查询指标
# db_service.insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_id,path)
# # #指标归一化处理
# db_service.update_ori_measure(conn,cursor,file_id)
# def print_measure_data(cursor,client):
# select_query = '''
# SELECT ori_measure_name,measure_name,ori_measure_id FROM measure_config
# where measure_id not in(select distinct measure_id from ori_measure_list where file_id='64')
# '''
# cursor.execute(select_query)
# records = cursor.fetchall()
# for record in records:
# ori_measure_name = record[0]
# measure_name = record[1]
# ori_measure_id = record[2]
# measure_vector = redis_service.read_from_redis(ori_measure_id)
# measure_list = ast.literal_eval(measure_vector)
# data = [measure_list]
# res = client.search(
# collection_name="pdf_measure_v4", # Replace with the actual name of your collection
# # Replace with your query vector
# data=data,
# limit=2, # Max. number of search results to return
# search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
# output_fields=["measure_name","measure_value","table_num","table_index"],
# filter = 'file_id == "64"'
# )
# vector_str = measure_name+":"+ori_measure_name
# # Convert the output to a formatted JSON string
# for i in range(len(res[0])):
# vector_distance = float(res[0][i]["distance"])
# vector_measure_name = res[0][i]["entity"]["measure_name"]
# measure_value = res[0][i]["entity"]["measure_value"]
# table_num = res[0][i]["entity"]["table_num"]
# table_index = res[0][i]["entity"]["table_index"]
# table_num_list = [106]
# print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index))
# # if vector_distance > 0.89 and table_num not in table_num_list:
# # print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(0.94))
# # if vector_distance > distance and table_num not in table_num_list:
# # print(vector_str +":"+vector_measure_name +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(vector_distance)+":"+str(distance))
if __name__ == "__main__":
    # Debug aid: read one stored vector back from Redis.
    # redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
    # vector = redis_service.read_from_redis(redis_client,'893301b0e4f1e07d16b4830fcdaea28a')
    # print(vector)
    db_settings = {
        "host": MYSQL_HOST,
        "user": MYSQL_USER,
        "password": MYSQL_PASSWORD,
        "database": MYSQL_DB,
    }
    conn = mysql.connector.connect(**db_settings)
    cursor = conn.cursor()
    # Alternative job (disabled): import the measure config file into MySQL.
    # measure_config_to_db(conn,cursor)
    insert_measure_vector(conn, cursor)
    # No explicit cursor.close()/conn.close() here: insert_measure_vector()
    # closes the connection itself.
# import re
# text = '减少11.04百分点'
# if re.match(r'(增加|减少)[了]?(\d+\.\d+)[个]?百分点', text):
# print('找到了单位。')
# unit_pattern = re.compile(r'(增加|减少)[了]?(\d+\.\d+)[个]?百分点')
# match = unit_pattern.search(text)
# print(len(match.groups()))
# if match:
# print(f'找到单位。')
# else:
# print(f'没有找到单位。')
# row1 = ['比例','比率','占比','费用']
# row2 = ['同比增减','同比上升','同比下降','变化幅度','变动比例','本期比上年同期增减','本年比上年增减','同比变动','本期期末金额较上期期末变动比例']
# for i in range(len(row1)):
# for j in range(len(row2)):
# print(f"{row1[i]}{row2[j]}")
# import os,re
# file_path = '/projects/ai_chat/knowledge_base/ydkf/content/体育运动处方及应用_13925781.docx'
# # 获取文件名和扩展名
# file_base_name, file_extension = os.path.splitext(os.path.basename(file_path))
# file_base_name = file_base_name.replace("_", "").replace("\d+", "")
# file_base_name = re.sub(r'\d+', '', file_base_name)
# print(f'文件名: {file_base_name}')
# import re
# print(len(re.findall('母公司|现金流量表补充', '补充资料')))
# import threading
# # 创建一个ThreadLocal变量
# local_data = threading.local()
# # 定义一个线程执行的工作函数
# def worker():
# # 为当前线程的ThreadLocal变量设置一个值
# local_data.data = f"Thread {threading.current_thread().name}'s data"
# print(local_data.data)
# # 创建并启动多个线程
# threads = []
# for i in range(3):
# thread = threading.Thread(target=worker)
# thread.start()
# threads.append(thread)
# # 等待所有线程完成
# for thread in threads:
# thread.join()
# for i in range(2,5):
# print(i)
# file_url = 'http://static.cninfo.com.cn/finalpage/2023-04-11/1216368607.PDF'
# file_path = utils.save_pdf_from_url(file_url, config.FILE_PATH)
# redis_client = redis.Redis(host='123.60.153.169', port=6379, password='Xgf_redis', db=6)
# print(redis_client.hget('measure_config', '2805fd5b7bfa960eb08312fa3d7c08'))
# client = MilvusClient(
# uri= MILVUS_CLIENT
# )
# conn = mysql.connector.connect(
# host=MYSQL_HOST,
# user=MYSQL_USER,
# password=MYSQL_PASSWORD,
# database=MYSQL_DB
# )
# cursor = conn.cursor()
# print_measure_data(cursor,client)
# redis_service.read_from_file_and_write_to_redis(conn,cursor)vim
# redis_service.read_from_redis()
# parent_table_pages = []
# file_id = '67'
# path = '/Users/zhengfei/Desktop/上汽车配/上汽车配_1.pdf'
# db_service.insert_table_measure_from_vector_test(conn,cursor,client,parent_table_pages,file_id,path)
# db_service.update_ori_measure(conn,cursor,file_id)
# main.get_table_measure(path,'all',file_id)
# insert_and_update(conn,cursor,client,parent_table_pages,file_id,path)
# measure_config_to_db(conn,cursor)
# params = ['f_102','f_103',]
# for param in params:
# globals()[param] = param.replace('f_','')
# # insert_measure_vector(conn,cursor)
# print(globals()['f_102'])
# db_service.update_ori_measure(conn,cursor,file_id)
# conn.commit()
# cursor.close()
# conn.close()
# # print(utils.get_md5('当期营业收入,2023年营业收入'))
# count_range_parts = utils.get_range(2300)
# print(count_range_parts)