pdf_code/zzb_data_word/test_db.py

from datetime import datetime
import re,os,json
import utils
import ast
import time
import redis_service
from multiprocessing import Process
from config_p import MILVUS_CLIENT,MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB,REDIS_HOST,REDIS_PORT,REDIS_PASSWORD,MEASURE_COUNT
from pymilvus import MilvusClient
import mysql.connector
import threading
import redis
measure_name_keywords = ["营业","季度","利润","归属于","扣非","经营","现金","活动","损益","收益","资产","费用","销售","管理","财务","研发"]

def insert_table_from_vector_mul_process(file_id):

    client = MilvusClient(
        uri=MILVUS_CLIENT
    )

    conn = mysql.connector.connect(
        host = MYSQL_HOST,
        user = MYSQL_USER,
        password = MYSQL_PASSWORD,
        database = MYSQL_DB
    )

    redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)

    # 创建一个cursor对象来执行SQL语句
    cursor = conn.cursor(buffered=True)

    select_year_select = f"""select report_type,year from report_check where id = {file_id}"""
    cursor.execute(select_year_select)
    record_select = cursor.fetchall()
    report_type = record_select[0][0]
    report_year = record_select[0][1]

    select_query_half_year = '''
                SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config_half_year
                where year = '{year}'
                and measure_name  rlike '货币资金|应收账款|存货|固定资产|在建工程|商誉|短期借款|应付账款|合同负债|长期借款|营业成本'
                '''.format(year=report_year)

    cursor.execute(select_query_half_year)
    records = cursor.fetchall()
    record_range=f'0-{len(records)}'

    check_query = '''
                select id from ori_measure_list
                WHERE file_id = %s and measure_name = %s and page_number = %s and table_index = %s and ori_measure_value = %s
                '''

    #获取表格上方文字黑名单关键词的页码和表格下标
    select_table_index_query = '''
                select distinct content from measure_parser_info WHERE file_id = '{file_id}' and type='table_index'
                '''.format(file_id=file_id)


    #表格上方文字黑名单关键词的页码和表格下标转成数组
    table_index_array = []
    cursor.execute(select_table_index_query)
    table_index_records = cursor.fetchall()
    for table_index_record in table_index_records:
        table_index_array.append(table_index_record[0])

    record_start = record_range.split('-')[0]
    record_end = record_range.split('-')[1]
    try:
        for index in range(int(record_start),int(record_end)):
            record = records[index]
            ori_measure_name = record[0]
            measure_name = record[1]
            ori_measure_id = record[3]
            measure_vector = redis_service.read_from_redis(redis_client,ori_measure_id)
            measure_list = ast.literal_eval(measure_vector)
            data = [measure_list]
            # data.append(measure_list)
            filter_str = 'file_id == "'+file_id+'"'
            res = client.search(
                collection_name="pdf_measure_v4", # Replace with the actual name of your collection
                # Replace with your query vector
                data=data,
                limit=5, # Max. number of search results to return
                search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
                output_fields=["measure_name","measure_value","table_num","table_index","measure_unit"],
                filter=filter_str
            )

            # Convert the output to a formatted JSON string
            # for i in range(len(res[0])):
            for i in range(len(res[0])):

                vector_distance = float(res[0][i]["distance"])
                pdf_measure = res[0][i]["entity"]["measure_name"]
                measure_value = res[0][i]["entity"]["measure_value"]
                table_num = res[0][i]["entity"]["table_num"]
                table_index = res[0][i]["entity"]["table_index"]
                unit = res[0][i]["entity"]["measure_unit"]

                #先过滤页码为0的情况，暂时不知道原因
                if table_num == 0:
                    continue

                #过滤表格上方文字黑名单关键词的页码和表格下标
                if f"{table_num}_{table_index}" in table_index_array:
                    continue
                #if f"{table_num}_{table_index}" in table_index_array and pdf_measure in ():

                #过滤指标中包含黑名单关键词
                if utils.check_pdf_measure_black_list(pdf_measure):
                    continue

                # if vector_distance > 0.80 :
                #检测规则开始
                #判断抽取指标和财报指标周期是否相同
                # ori_period = utils.get_period_type(ori_measure_name, report_year)
                # pdf_period = utils.get_period_type(pdf_measure, report_year)
                # if(ori_period != pdf_period):
                #     continue

                # #判断抽取指标和财报指标是否期初指标
                # start_ori_period = utils.get_start_period_type(ori_measure_name)
                # start_pdf_period = utils.get_start_period_type(pdf_measure)
                # if(start_ori_period != start_pdf_period):
                #     continue

                # #判断抽取指标和财报指标类型是否相同，是否都是季度
                # ori_season_type = utils.get_season_flag(ori_measure_name)
                # pdf_season_type = utils.get_season_flag(pdf_measure)
                # if(ori_season_type != pdf_season_type):
                #     continue

                # #判断是否都是扣非指标
                # ori_kf_type = utils.get_kf_flag(ori_measure_name)
                # pdf_kf_type = utils.get_kf_flag(pdf_measure)
                # if(ori_kf_type != pdf_kf_type):
                #     continue

                # #判断抽取指标和财报指标类型是否相同，是否都是百分比
                # ori_type = utils.get_percent_flag(ori_measure_name)
                # pdf_type = utils.get_percent_flag(pdf_measure)
                # if(ori_type != pdf_type):
                #     continue

                # #判断抽取指标和财报指标类型是否相同，是否都是占比同比变动类
                # ori_growth_type = utils.get_percent_growth(ori_measure_name)
                # pdf_growth_type = utils.get_percent_growth(pdf_measure)
                # if(ori_growth_type != pdf_growth_type):
                #     continue

                # #解决指标语义是比率，但值为非比率的情况
                # if ori_growth_type == '1':
                #     check_measure_value = abs(float(measure_value))
                #     if(check_measure_value > 10000):
                #         continue

                # # 判断数据库中是否有数据
                # check_query_data = (file_id, measure_name, int(table_num), int(table_index), measure_value)
                # cursor.execute(check_query, check_query_data)
                # check_records = cursor.fetchall()
                # if(len(check_records)) > 0:
                #     continue

                # #判断是否包含黑名单
                # if(utils.check_black_list(measure_name,pdf_measure)):
                #     continue

                # #判断抽取指标和财报指标类型是否都是增长类，比如同比变动为增长类
                # ori_change_type = utils.get_change_rate_flag(ori_measure_name)
                # pdf_change_type = utils.get_change_rate_flag(pdf_measure)
                # if(ori_change_type != pdf_change_type):
                #     continue

                # #处理调整前，调整前、后同时出现，如果有调整前过滤
                # if pdf_measure.find('调整前') != -1 or pdf_measure.find('重述前') != -1:
                #     continue

                # #判断指标是否报告期初
                # ori_report_start = utils.get_report_start(ori_measure_name)
                # pdf_report_start = utils.get_report_start(pdf_measure)
                # if(ori_report_start != pdf_report_start):
                #     continue

                print(f'{measure_name},{ori_measure_name},{pdf_measure},{vector_distance},{table_num},{table_index}')

    except Exception as e:
        print(e)
    finally:
        redis_client.close()
        cursor.close()
        conn.close()
        client.close()


if __name__ == '__main__':
    insert_table_from_vector_mul_process('1766')