commit a0cde740d8b84a2dc1a85e537144dfbf7a63dafd
Author: zdzerg
Date:   Tue Sep 9 17:39:44 2025 +0800

    init

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8e4be9c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+logs/
+*.log
+pdf/
+zzb_data_prod/nohup.out
+zzb_data_prod/logs/
+zzb_data_prod/app.log
diff --git a/milvus_init.py b/milvus_init.py
new file mode 100644
index 0000000..3cfcf12
--- /dev/null
+++ b/milvus_init.py
@@ -0,0 +1,33 @@
+from pymilvus import connections, CollectionSchema, Collection, utility, FieldSchema, DataType
+
+# Connect to Milvus (the commented-out line points at the remote test server)
+# connections.connect(host='124.70.129.232', port='19530')
+connections.connect(host='127.0.0.1', port='19530')
+
+# Drop the collection if it already exists
+utility.drop_collection("pdf_measure_v4")
+
+# Define the fields
+fields = [
+    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
+    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=1536),
+    FieldSchema(name="table_num", dtype=DataType.INT16),
+    FieldSchema(name="table_index", dtype=DataType.INT16),
+    FieldSchema(name="measure_name", dtype=DataType.VARCHAR, max_length=200),
+    FieldSchema(name="measure_value", dtype=DataType.VARCHAR, max_length=200),
+    FieldSchema(name="file_id", dtype=DataType.VARCHAR, max_length=200),
+    FieldSchema(name="measure_unit", dtype=DataType.VARCHAR, max_length=200)
+]
+
+# Define the collection schema
+schema = CollectionSchema(fields=fields, description="My Milvus collection")
+
+# Create the collection, build a cosine IVF_FLAT index on the vector field, then load it
+collection = Collection(name="pdf_measure_v4", schema=schema)
+index_params = {
+    "index_type": "IVF_FLAT",
+    "metric_type": "COSINE",
+    "params": {"nlist": 128}
+}
+collection.create_index(field_name="vector", index_params=index_params)
+collection.load()
\ No newline at end of file
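milvus_init.py only creates and indexes the collection; retrieval happens elsewhere in the repo. As a minimal sketch of what a query against this schema looks like — assuming the collection above is already loaded, and treating `query_vec`, the `nprobe` value, and the `file_id` filter as illustrative placeholders rather than code from this commit:

from pymilvus import connections, Collection

connections.connect(host='127.0.0.1', port='19530')
collection = Collection("pdf_measure_v4")

query_vec = [0.0] * 1536  # placeholder; real queries embed a measure name

# Cosine search over the IVF_FLAT index, restricted to one parsed file
results = collection.search(
    data=[query_vec],
    anns_field="vector",
    param={"metric_type": "COSINE", "params": {"nprobe": 10}},
    limit=3,
    expr='file_id == "201837"',  # hypothetical file_id
    output_fields=["measure_name", "measure_value", "measure_unit"],
)
for hit in results[0]:
    print(hit.distance, hit.entity.get("measure_name"))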
"""监控8000端口,如果异常则启动应用进程""" + print(f"[{get_time()}] 检查8000端口状态...") + port_available = check_port("127.0.0.1", 8000) + + if not port_available: + print(f"[{get_time()}] 检测到8000端口异常,尝试启动应用进程...") + success = start_application_process() + + if success: + # 启动后检查是否成功 + time.sleep(10) # 等待应用启动 + if check_port("127.0.0.1", 8000): + print(f"[{get_time()}] 应用进程启动成功,8000端口已正常") + # INSERT_YOUR_CODE + # 检查并修改数据库字段 + try: + + conn = mysql.connector.connect( + host=MYSQL_HOST, + user=MYSQL_USER, + password=MYSQL_PASSWORD, # 请替换为实际密码 + database=MYSQL_DB # 请替换为实际数据库名 + ) + cursor = conn.cursor() + local_ip = get_local_ip() + sql = f"update model_ip set status = 0 where ip = '{local_ip}:8000';" + print(f"[{get_time()}] 执行sql: {sql}") + cursor.execute(sql) + conn.commit() + print(f"[{get_time()}] 数据库字段已成功修改") + except Exception as e: + print(f"[{get_time()}] 修改数据库字段失败: {str(e)}") + finally: + try: + cursor.close() + conn.close() + except: + pass + else: + print(f"[{get_time()}] 应用进程启动后,8000端口仍未正常") + else: + print(f"[{get_time()}] 8000端口状态正常") + + + +if __name__ == '__main__': + print(f"[{get_time()}] 启动Milvus监控服务") + port_ok = check_port("127.0.0.1", 19530) + if not port_ok: + print("检测到Milvus服务异常,尝试重启...") + restart_service() + + print(f"[{get_time()}] 启动 8000 端口监控服务") + # 开始监控8000端口,每60秒检查一次 + monitor_port_8000() + + diff --git a/restart_app.sh b/restart_app.sh new file mode 100755 index 0000000..a84fce0 --- /dev/null +++ b/restart_app.sh @@ -0,0 +1,76 @@ +#!/bin/bash + + + +# 切换到 /root/docker/milvus 目录 +cd /root/docker/milvus || { echo "无法进入目录 /root/docker/milvus"; exit 1; } +# 运行 standalone_embed.sh restart 指令 +bash standalone_embed.sh restart + + +#!/bin/bash + +# 目标目录(根据实际路径修改) +TARGET_DIR="/root/pdf_parser/pdf" +LOG_FILE="/root/pdf_parser/logs/pdf_clean.log" + +# 创建日志目录 +mkdir -p "$(dirname "$LOG_FILE")" + +# 带时间戳的日志函数 +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE" +} + +# 检查目标目录是否存在 +if [ ! -d "$TARGET_DIR" ]; then + log "错误:目标目录不存在 $TARGET_DIR" + exit 1 +fi + +# 执行清理并记录 +log "开始清理PDF文件..." +find "$TARGET_DIR" -iname "*.pdf" -print0 | while IFS= read -r -d $'\0' file; do + log "删除文件: $file" + rm -f "$file" +done + +log "清理完成,共删除 $(find "$TARGET_DIR" -iname "*.pdf" | wc -l) 个残留文件" + +# 设置工作目录和日志路径 +WORK_DIR="/root/pdf_parser/zzb_data_prod" +LOG_FILE="$WORK_DIR/app.log" + +# 终止现有进程 +pids=$(ps -ef | grep app.py | grep -v grep | awk '{print $2}') +if [ -n "$pids" ]; then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] 正在停止现有进程: $pids" + kill -9 $pids +else + echo "[$(date '+%Y-%m-%d %H:%M:%S')] 未找到正在运行的进程" +fi + +# 进入工作目录 +cd $WORK_DIR || { echo "无法进入目录 $WORK_DIR"; exit 1; } + +# 启动服务 +echo "[$(date '+%Y-%m-%d %H:%M:%S')] 启动服务..." +nohup python3 app.py > $LOG_FILE 2>&1 & + +# 等待进程启动 +sleep 2 + +# 检查进程状态 +new_pid=$(ps -ef | grep app.py | grep -v grep | awk '{print $2}') +if [ -n "$new_pid" ]; then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] 服务启动成功,进程ID: $new_pid" + echo "--------------------------------" + tail -n 10 $LOG_FILE +else + echo "[$(date '+%Y-%m-%d %H:%M:%S')] 服务启动失败!" 
+ echo "--------------------------------" + cat $LOG_FILE + exit 1 +fi + + diff --git a/zzb_data_prod b/zzb_data_prod new file mode 160000 index 0000000..0fcb3b0 --- /dev/null +++ b/zzb_data_prod @@ -0,0 +1 @@ +Subproject commit 0fcb3b0383d5870477694d6ea999c5f9f535f3d8 diff --git a/zzb_data_word/Mil_unit.py b/zzb_data_word/Mil_unit.py new file mode 100644 index 0000000..130243a --- /dev/null +++ b/zzb_data_word/Mil_unit.py @@ -0,0 +1,30 @@ +from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection,MilvusClient +from config import MILVUS_CLIENT +import time +from datetime import datetime, timedelta + +def create_partition_by_hour(current_hour): + # 连接到 Milvus 服务器 + connections.connect("default",uri=MILVUS_CLIENT) + # 获取集合 + collection_name = "pdf_measure_v4" + collection = Collection(collection_name) + + # 创建当前小时的分区 + partition_name = f"partition_{current_hour}" + if not collection.has_partition(partition_name): + collection.create_partition(partition_name) + print(f"Created partition: {partition_name}") + partition = collection.partition(partition_name) + partition.load() + + # 获取所有分区 + partitions = collection.partitions + # 删除所有分区(除了默认分区和当前分区) + for partition in partitions: + name = partition.name + if name not in ["_default", partition_name]: # 保留默认分区 + pre_partition = collection.partition(name) + pre_partition.release() + collection.drop_partition(name) + print(f"Partition '{name}' deleted.") \ No newline at end of file diff --git a/zzb_data_word/__pycache__/config.cpython-310.pyc b/zzb_data_word/__pycache__/config.cpython-310.pyc new file mode 100644 index 0000000..49f15b4 Binary files /dev/null and b/zzb_data_word/__pycache__/config.cpython-310.pyc differ diff --git a/zzb_data_word/__pycache__/db_service_word.cpython-310.pyc b/zzb_data_word/__pycache__/db_service_word.cpython-310.pyc new file mode 100644 index 0000000..95849e4 Binary files /dev/null and b/zzb_data_word/__pycache__/db_service_word.cpython-310.pyc differ diff --git a/zzb_data_word/__pycache__/main_word.cpython-310.pyc b/zzb_data_word/__pycache__/main_word.cpython-310.pyc new file mode 100644 index 0000000..40217a6 Binary files /dev/null and b/zzb_data_word/__pycache__/main_word.cpython-310.pyc differ diff --git a/zzb_data_word/__pycache__/parse_word.cpython-310.pyc b/zzb_data_word/__pycache__/parse_word.cpython-310.pyc new file mode 100644 index 0000000..a58b0dc Binary files /dev/null and b/zzb_data_word/__pycache__/parse_word.cpython-310.pyc differ diff --git a/zzb_data_word/__pycache__/redis_service.cpython-310.pyc b/zzb_data_word/__pycache__/redis_service.cpython-310.pyc new file mode 100644 index 0000000..7365bdd Binary files /dev/null and b/zzb_data_word/__pycache__/redis_service.cpython-310.pyc differ diff --git a/zzb_data_word/__pycache__/utils.cpython-310.pyc b/zzb_data_word/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000..0e38ff3 Binary files /dev/null and b/zzb_data_word/__pycache__/utils.cpython-310.pyc differ diff --git a/zzb_data_word/__pycache__/word_title.cpython-310.pyc b/zzb_data_word/__pycache__/word_title.cpython-310.pyc new file mode 100644 index 0000000..d8f248f Binary files /dev/null and b/zzb_data_word/__pycache__/word_title.cpython-310.pyc differ diff --git a/zzb_data_word/__pycache__/zzb_logger.cpython-310.pyc b/zzb_data_word/__pycache__/zzb_logger.cpython-310.pyc new file mode 100644 index 0000000..5eb8089 Binary files /dev/null and b/zzb_data_word/__pycache__/zzb_logger.cpython-310.pyc differ diff --git a/zzb_data_word/app_word.py 
b/zzb_data_word/app_word.py new file mode 100644 index 0000000..e71e1a2 --- /dev/null +++ b/zzb_data_word/app_word.py @@ -0,0 +1,225 @@ +from fastapi import FastAPI +from pydantic import BaseModel +import os +import utils +import queue +from multiprocessing import Process +import word_title +import time +import config +import requests +import threading +from parse_word import parse_docx, split_text_table +import json +import db_service_word +import main_word +from zzb_logger import applog + + +app = FastAPI() +cpu_count = os.cpu_count() +job_queue = queue.Queue() + +# 定义请求体模型 +class FileItem(BaseModel): + file_path: str + file_id: str + +def split_list(lst, n): + k, m = divmod(len(lst), n) + return [lst[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)] + +def run_job(): + #判断是否有任务在执行 + if_run = True + + if job_queue.empty(): + applog.info(f"job_queue为空:") + if_run = False + + if if_run: + job_config = job_queue.get() + file_path = job_config['file_path'] + file_id = job_config['file_id'] + continue_execution = True + try: + + start_time = time.time() + applog.info(f"开始启动文件解析任务: {file_path}") + if file_path.startswith('http'): + file_path = utils.save_pdf_from_url(file_path, config.FILE_PATH) + try: + time_dispatch_job = time.time() + # 通知开始解析 暂时不通知 + response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 5}) + applog.info(f'通知pdf开始解析url:{file_id}:{response.url}') + applog.info(f'通知pdf开始解析状态:{file_id}:{response.text}') + parsed_content, catalog_content = parse_docx(file_path) + + json_parsed_content = json.loads(parsed_content) + json_catalog_content = json.loads(catalog_content) + + db_service_word.word_title_insert_mysql(file_id, json_catalog_content) + + parent_table_pages = word_title.get_parent_table_pages(json_catalog_content,file_id) + + text_elements_json, table_elements_json = split_text_table(json_parsed_content) + # + processes = [] + text_list = split_list(json.loads(text_elements_json), cpu_count) + applog.info(f'text,任务ID:{file_id}') + for job_info in text_list: + p = Process(target=main_word.process_text_content, args=(file_id, job_info,json.loads(table_elements_json),json.loads(text_elements_json))) + processes.append(p) + p.start() + applog.info(f'等待所有子任务完成,任务ID:{file_id}') + for p in processes: + p.join() + applog.info(f'word表格中 text解析完成,任务ID:{file_id}',) + + processes = [] + table_list = split_list(json.loads(table_elements_json), cpu_count) + applog.info(f'开始解析word表表格中的table,任务ID:{file_id}') + for job_info in table_list: + p = Process(target=main_word.process_table, args=(file_id, job_info,)) + processes.append(p) + p.start() + applog.info(f'等待所有子任务完成,任务ID:{file_id}' ) + for p in processes: + p.join() + + # main_word.process_table(file_id, json.loads(table_elements_json)) + applog.info(f'word表格中 table解析完成,任务ID:{file_id}') + + + time_dispatch_job_end = time.time() + process_time = time_dispatch_job_end - time_dispatch_job + db_service_word.process_time(file_id, '1', process_time, time_dispatch_job, time_dispatch_job_end) + parser_end_time = time.time() + applog.info(f"解析任务 {file_id} 完成,耗时{(parser_end_time - time_dispatch_job):.2f} 秒。") + + except Exception as e: + response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 7}) + applog.info(f'通知任务状态url:{file_id}:{response.url}') + applog.info(f'通知任务状态任务:{file_id}:{response.text}') + applog.info(f"{file_id}运行失败: {e}") + continue_execution = False + if continue_execution : + #这里做一步判断,看看是否还要继续。 + if db_service_word.file_type_check(file_id): + 
applog.info("文本较真表格生成已结束") + else: + # 通知抽取指标--------------------------------- + response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 6}) + applog.info(f'通知开始抽取指标url:{file_id}:{response.url}') + applog.info(f'通知开始抽取指标状态:{file_id}:{response.text}') + + parser_start_time = time.time() + applog.info(f'开始表格指标抽取,任务ID:{file_id}') + time_start = time.time() + if db_service_word.file_type_check_v2(file_id) == 3 : #判断是否为3季报 + main_word.start_table_measure_job(file_id) + #time_start_end = time.time() + #process_time = time_start_end - time_start + #db_service.process_time(file_id,'2',process_time) + time_start_end = time.time() + process_time = time_start_end - time_start + db_service_word.process_time(file_id,'2',process_time,time_start,time_start_end) + applog.info(f'表格指标抽取完成,任务ID:{file_id}') + parser_end_time = time.time() + applog.info(f"表格指标抽取 {file_id} 完成,耗时{(parser_end_time - parser_start_time):.2f} 秒。") + + applog.info(f'启动这个指标归一化任务ID-修改测试:{file_id}') + time_update = time.time() + main_word.update_measure_data(file_id,file_path,parent_table_pages) + #time_update_end = time.time() + #process_time = time_update_end - time_update + #db_service.process_time(file_id,'3',process_time) + applog.info(f'归一化完成任务ID:{file_id}') + end_time = time.time() + applog.info(f"任务 {file_id} 完成,耗时{(end_time - start_time):.2f} 秒。") + time_update_end = time.time() + process_time = time_update_end - time_update + db_service_word.process_time(file_id,'3',process_time,time_update,time_update_end) + else:#不是三季报就直接按照年报和半年报走 + main_word.start_table_measure_job(file_id) + #time_start_end = time.time() + #process_time = time_start_end - time_start + #db_service.process_time(file_id,'2',process_time) + time_start_end = time.time() + process_time = time_start_end - time_start + db_service_word.process_time(file_id,'2',process_time,time_start,time_start_end) + applog.info(f'表格指标抽取完成,任务ID:{file_id}' ) + parser_end_time = time.time() + applog.info(f"表格指标抽取 {file_id} 完成,耗时{(parser_end_time - parser_start_time):.2f} 秒。") + + applog.info(f'启动这个指标归一化任务ID-修改测试:{file_id}' ) + time_update = time.time() + main_word.update_measure_data(file_id,file_path,parent_table_pages) + #time_update_end = time.time() + #process_time = time_update_end - time_update + #db_service.process_time(file_id,'3',process_time) + applog.info(f'归一化完成任务ID:{file_id}') + end_time = time.time() + applog.info(f"任务 {file_id} 完成,耗时{(end_time - start_time):.2f} 秒。") + time_update_end = time.time() + process_time = time_update_end - time_update + db_service_word.process_time(file_id,'3',process_time,time_update,time_update_end) + #通知任务完成 + response_time = time.time() + + response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 1}) + applog.info(f'通知任务状态url:{file_id}:{response.url}') + applog.info(f'通知任务状态任务:{file_id}:{response.text}') + + response_time_end = time.time() + process_time = response_time_end - response_time + db_service_word.process_time(file_id,'4',process_time,response_time,response_time_end) + except Exception as e: + #通知任务完成 + response_time = time.time() + response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 4}) + response_time_end = time.time() + process_time = response_time_end - response_time + db_service_word.process_time(file_id,'4',process_time,response_time,response_time_end) + applog.info(f'通知任务状态url:{file_id}:{response.url}') + applog.info(f'通知任务状态任务:{file_id}:{response.text}') + applog.info(f"Response status code: {response.status_code}") + applog.info(f"{file_id}运行失败: 
{e}") + finally: + applog.info(f"任务 {file_id} 完成") + + else: + applog.info("有任务运行中,需要等待.....") + +def parse_route(fileItem: FileItem): + # 创建一个队列,保证每次只执行一个文件解析任务 + job_queue.put({ + 'file_path' : fileItem.file_path, + 'file_id' : fileItem.file_id, + # 'type': fileItem.type + }) + applog.info(f"增加 {fileItem.file_id} 到队列.") + threading.Thread(target=run_job, args=()).start() + + return {"success": True, "msg": "文件解析开始"} + +app.post("/parser/start", + tags=["parser"], + summary="解析Pdf文件", + )(parse_route) + +# 运行 FastAPI 应用 +if __name__ == "__main__": + # 服务器启动服务 + import uvicorn + + uvicorn.run(app, host="0.0.0.0", port=config.PORT) + # 本地调试任务 + # file_id = "201837" + # job_queue.put({ + # 'file_path': '西部建设.docx', + # 'file_id': file_id, + # }) + # db_service_word.delete_database(file_id) + # run_job() diff --git a/zzb_data_word/config.py b/zzb_data_word/config.py new file mode 100644 index 0000000..1a3d2da --- /dev/null +++ b/zzb_data_word/config.py @@ -0,0 +1,23 @@ +MILVUS_CLIENT='http://127.0.0.1:19530' +MILVUS_HOST = '127.0.0.1' +MILVUS_PORT = 19530 +MYSQL_HOST = '10.127.2.207' +MYSQL_PORT = 3306 +MYSQL_USER = 'financial_prod' +MYSQL_PASSWORD = 'mmTFncqmDal5HLRGY0BV' +MYSQL_DB = 'financial_report_prod' +NOTIFY_ADDR = 'http://10.127.2.202:8100/api/tenant/report/notify' +FILE_PATH = '/root/pdf_parser/word/' +REDIS_HOST = '10.127.2.209' +REDIS_PORT = 6379 +REDIS_PASSWORD = 'dMrt4kmwiW6LDJXy' +PORT = 8001 +MEASURE_COUNT = 8 + + +MYSQL_HOST_APP = '10.127.2.207' +MYSQL_PORT_APP = 3306 +MYSQL_USER_APP = 'financial_prod' +MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV' +MYSQL_DB_APP = 'financial_report_prod' +api_key = 'sk-f8413fcfa63a40e49dbb8ac1d0239f6d' diff --git a/zzb_data_word/db_service_word.py b/zzb_data_word/db_service_word.py new file mode 100644 index 0000000..35b7814 --- /dev/null +++ b/zzb_data_word/db_service_word.py @@ -0,0 +1,1054 @@ +from datetime import datetime +import re,os,json +import utils +import ast +import time +import redis_service +from multiprocessing import Process +from config import MILVUS_CLIENT,MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB,REDIS_HOST,REDIS_PORT,REDIS_PASSWORD,MEASURE_COUNT,MYSQL_HOST_APP,MYSQL_USER_APP,MYSQL_PASSWORD_APP,MYSQL_DB_APP +from pymilvus import MilvusClient +import mysql.connector +import redis +from zzb_logger import applog +measure_name_keywords = ["营业","季度","利润","归属于","扣非","经营","现金","活动","损益","收益","资产","费用","销售","管理","财务","研发","货币资金","应收账款","存货","固定资产","在建工程","商誉","短期借款","应付账款","合同负债","长期借款","营业成本"] +# 解析大模型抽取的指标,并插入到数据库 +def parse_llm_measure_to_db(measure_info,type,conn,cursor): + + create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + check_query = ''' + select id from ori_measure_list + WHERE file_id = %s and type = %s and page_number = %s and ori_measure_value = %s + ''' + # 执行SQL语句,插入数据 + insert_query = ''' + INSERT INTO ori_measure_list + (file_id, file_name, type, page_number, table_index, ori_measure_id, ori_measure_name, ori_measure_value, create_time, update_time) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + ''' + file_id = measure_info['file_id'] + file_name = measure_info['path'] + llm_measure = measure_info['llm_measure'] + page_num = measure_info['page_num'] + table_index = '0' + if type == 'table': + table_index = measure_info['table_index'] + for measure_obj in llm_measure: + measure_obj = measure_obj.replace('\n', '').replace('\r', '').replace(' ', '').replace(':', ':') + if ':' in measure_obj: + ori_measure_name = measure_obj.split(':')[0].replace('-', '') + if len(ori_measure_name) > 30 : + continue 
+ ori_measure_value = measure_obj.split(':')[1].replace('+', '').replace(',', '').replace('元', '').replace('%', '') + if '-' in ori_measure_value: + ori_measure_value = "-" + if '.' in ori_measure_name: + ori_measure_name = ori_measure_name.split('.')[1] + ori_measure_id = utils.get_md5(ori_measure_name) + if re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', ori_measure_value): + # 判断数据库中是否有数据 + check_query_data = (file_id, 'text', int(page_num), ori_measure_value) + cursor.execute(check_query, check_query_data) + check_records = cursor.fetchall() + if(len(check_records)) > 0: + continue + data_to_insert = (file_id, file_name, type, int(page_num), int(table_index), ori_measure_id, ori_measure_name, ori_measure_value, create_time, create_time) + cursor.execute(insert_query, data_to_insert) + conn.commit() + +def insert_measure_parser_info(parser_info,conn,cursor): + create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + # 执行SQL语句,插入数据 + insert_query = ''' + INSERT INTO measure_parser_info + (file_id, type, content, create_time) + VALUES (%s, %s, %s, %s) + ''' + file_id = parser_info['file_id'] + type = parser_info['type'] + content = parser_info['content'] + data_to_insert = (file_id, type, content, create_time) + cursor.execute(insert_query, data_to_insert) + conn.commit() +def insert_measure_parser_info_measure(parser_info, conn, cursor, line_text): + create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + insert_query = ''' + INSERT INTO word_measure_parser_info_linetext + (file_id, type, content,text, create_time) + VALUES (%s, %s, %s, %s,%s) + ''' + file_id = parser_info['file_id'] + type = parser_info['type'] + content = parser_info['content'] + text = line_text + data_to_insert = (file_id, type, content,text, create_time) + cursor.execute(insert_query, data_to_insert) + conn.commit() +def insert_table_unit_info(table_info,conn,cursor): + + # 执行SQL语句,插入数据 + insert_query = ''' + INSERT INTO table_unit_info + (file_id, page_num, table_index, unit) + VALUES (%s, %s, %s, %s) + ''' + file_id = table_info['file_id'] + page_num = int(table_info['page_num']) + table_index = int(table_info['table_index']) + unit = table_info['unit'] + data_to_insert = (file_id, page_num, table_index, unit) + cursor.execute(insert_query, data_to_insert) + conn.commit() + +def insert_table_unit_info_v1(table_info, conn, cursor): + """ + 插入数据到 table_unit_info 表之前,检查是否存在相同的 file_id, page_num 和 table_index。 + 如果存在且 unit 不同,更新现有记录,否则插入新记录。 + """ + + file_id = table_info['file_id'] + page_num = int(table_info['page_num']) + table_index = int(table_info['table_index']) + unit = table_info['unit'] + + # 查询现有记录 + check_query = ''' + SELECT unit + FROM table_unit_info + WHERE file_id = %s AND page_num = %s AND table_index = %s + ''' + cursor.execute(check_query, (file_id, page_num, table_index)) + existing_record = cursor.fetchone() + + if existing_record: + existing_unit = existing_record[0] + + if unit != existing_unit: + # 更新现有记录 + update_query = ''' + UPDATE table_unit_info + SET unit = %s + WHERE file_id = %s AND page_num = %s AND table_index = %s + ''' + cursor.execute(update_query, (unit, file_id, page_num, table_index)) + + else: + applog.info(f'No change needed. 
Existing unit={existing_unit} is the same as new unit={unit}.') + else: + # 插入新的记录 + insert_query = ''' + INSERT INTO table_unit_info + (file_id, page_num, table_index, unit) + VALUES (%s, %s, %s, %s) + ''' + data_to_insert = (file_id, page_num, table_index, unit) + cursor.execute(insert_query, data_to_insert) + + conn.commit() + +def insert_table_text_info(table_info,conn,cursor): + + # 执行SQL语句,插入数据 + insert_query = ''' + INSERT INTO table_text_info + (file_id, page_num, table_index, text) + VALUES (%s, %s, %s, %s) + ''' + file_id = table_info['file_id'] + page_num = int(table_info['page_num']) + table_index = int(table_info['table_index']) + text = table_info['text_info'] + data_to_insert = (file_id, page_num, table_index, text) + cursor.execute(insert_query, data_to_insert) + conn.commit() + +def update_ori_measure(conn,cursor,file_id): + + select_year_select = f"""select report_type,year from report_check where id = {file_id}""" + cursor.execute(select_year_select) + record_select = cursor.fetchall() + report_type = record_select[0][0] + report_year = record_select[0][1] + + # 执行SQL语句,更新数据 + update_query = ''' + UPDATE ori_measure_list + SET measure_id = %s, measure_name = %s + WHERE ori_measure_id = %s and file_id = %s + ''' + + select_query = ''' + SELECT t2.measure_id,t2.measure_name,t1.ori_measure_id + FROM ori_measure_list t1 + left join + measure_config t2 + on t1.ori_measure_id = t2.ori_measure_id + where t2.measure_id is not null and (t1.measure_id is null or t1.measure_id ='') + and t1.file_id = '{file_id}' + and t2.year = '{year}' + '''.format(file_id=file_id, year=report_year) + select_query_half_year = ''' + SELECT t2.measure_id,t2.measure_name,t1.ori_measure_id + FROM ori_measure_list t1 + left join + measure_config_half_year t2 + on t1.ori_measure_id = t2.ori_measure_id + where t2.measure_id is not null and (t1.measure_id is null or t1.measure_id ='') + and t1.file_id = '{file_id}' + and t2.year = '{year}' + '''.format(file_id=file_id, year=report_year) + select_query_thrid = ''' + SELECT t2.measure_id,t2.measure_name,t1.ori_measure_id + FROM ori_measure_list t1 + left join + measure_config_third_quarter t2 + on t1.ori_measure_id = t2.ori_measure_id + where t2.measure_id is not null and (t1.measure_id is null or t1.measure_id ='') + and t1.file_id = '{file_id}' + and t2.year = '{year}' + '''.format(file_id=file_id, year=report_year) + + select_query_first_quarter = ''' + SELECT t2.measure_id,t2.measure_name,t1.ori_measure_id + FROM ori_measure_list t1 + left join + measure_config_first_quarter t2 + on t1.ori_measure_id = t2.ori_measure_id + where t2.measure_id is not null and (t1.measure_id is null or t1.measure_id ='') + and t1.file_id = '{file_id}' + and t2.year = '{year}' + '''.format(file_id=file_id, year=report_year) + + if report_type == 1: + start_time = time.time() + cursor.execute(select_query_half_year) + records = cursor.fetchall() + end_time = time.time() + applog.info(f"更新数据查询 {(end_time - start_time):.2f} 秒。") + applog.info(f'update_ori_measure方法走的是半年报') + elif report_type == 2: + start_time = time.time() + cursor.execute(select_query_first_quarter) + records = cursor.fetchall() + end_time = time.time() + applog.info(f"更新数据查询 {(end_time - start_time):.2f} 秒。") + applog.info(f'update_ori_measure方法走的是一季报') + elif report_type == 3: + start_time = time.time() + cursor.execute(select_query_thrid) + records = cursor.fetchall() + end_time = time.time() + applog.info(f"更新数据查询 {(end_time - start_time):.2f} 秒。") + applog.info(f'update_ori_measure方法走的是三季报') + else: + 
start_time = time.time() + cursor.execute(select_query) + records = cursor.fetchall() + end_time = time.time() + applog.info(f"更新数据查询 {(end_time - start_time):.2f} 秒。") + applog.info(f'update_ori_measure方法走的是全年报') + start_time = time.time() + for record in records: + data_to_update = (record[0], record[1], record[2], file_id) + cursor.execute(update_query, data_to_update) + conn.commit() + end_time = time.time() + applog.info(f"更新数据更新 {(end_time - start_time):.2f} 秒。") + #更新measure_list表,增加此次文件的显示指标 + start_time = time.time() + create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + if report_type == 0: + table_name = "measure_config" + elif report_type == 2: + table_name = "measure_config_first_quarter" + + elif report_type == 3: + table_name = "measure_config_third_quarter" + else: + table_name = "measure_config_half_year" + + insert_query = f''' + INSERT INTO measure_list + (measure_id, measure_name, create_time, update_time, file_id) + select distinct measure_id,measure_name, %s,%s,%s from {table_name} + where year = {report_year} + ''' + + data_to_update = (create_time, create_time, file_id) + cursor.execute(insert_query, data_to_update) + conn.commit() + end_time = time.time() + applog.info(f"更新数据写入 {(end_time - start_time):.2f} 秒。") + +def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,records,record_range,black_array): + create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + applog.info('Run task %s (%s)...' % (record_range, os.getpid())) + applog.info(f"插入数据 {len(records)}") + client = MilvusClient( + uri=MILVUS_CLIENT + ) + + conn = mysql.connector.connect( + host = MYSQL_HOST, + user = MYSQL_USER, + password = MYSQL_PASSWORD, + database = MYSQL_DB + ) + + redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6) + + + # 创建一个cursor对象来执行SQL语句 + cursor = conn.cursor(buffered=True) + conn_app = mysql.connector.connect( + host = MYSQL_HOST_APP, + user = MYSQL_USER_APP, + password = MYSQL_PASSWORD_APP, + database = MYSQL_DB_APP + ) + cursor_app = conn_app.cursor(buffered=True) + + select_year_select = f"""select report_type,year from report_check where id = {file_id}""" + cursor.execute(select_year_select) + record_select = cursor.fetchall() + report_type = record_select[0][0] + report_year = record_select[0][1] + + check_query = ''' + select id from ori_measure_list + WHERE file_id = %s and measure_name = %s and page_number = %s and table_index = %s and ori_measure_value = %s + ''' + insert_query = ''' + INSERT INTO ori_measure_list + (file_id, file_name, type, page_number, table_index, ori_measure_id, ori_measure_name, ori_measure_value, create_time, update_time, distance, pdf_measure,measure_id,measure_name,unit) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + ''' + #获取表格上方文字包含母公司字样的文本index + select_parent_query = ''' + select distinct content from measure_parser_info WHERE file_id = '{file_id}' and type='parent_com' + '''.format(file_id=file_id) + + #获取表格上方文字黑名单关键词的页码和表格下标 + select_table_index_query = ''' + select distinct content from measure_parser_info WHERE file_id = '{file_id}' and type='table_index' + '''.format(file_id=file_id) + # #获取表格上方文字黑名单关键词的页码和表格下标----标题下的详细指标 + select_measure_index_query = ''' + SELECT content FROM measure_parser_info_linetext WHERE file_id = %s AND type = 'measure_index' + ''' + unit_query = ''' + select unit from table_unit_info + WHERE file_id = %s and page_num = %s and table_index = %s + ''' + + cursor_app.execute(select_parent_query) + 
parent_records = cursor_app.fetchall() + + for parent_record in parent_records: + parent_id = parent_record[0] + parent_table_pages.append(int(parent_id)) + + #表格上方文字黑名单关键词的页码和表格下标转成数组 + table_index_array = [] + cursor_app.execute(select_table_index_query) + table_index_records = cursor_app.fetchall() + for table_index_record in table_index_records: + table_index_array.append(table_index_record[0]) + # #仿照写法,指标的黑名单转化 + measure_index_array = [] + cursor_app.execute(select_measure_index_query, (file_id,)) + measure_index_records = cursor_app.fetchall() + for measure_index_record in measure_index_records: + measure_index_array.append(measure_index_record[0]) + + + if str(report_type) == "2": + parent_table_pages = [] + table_index_array = [] + measure_index_array = [] + applog.info(f'黑名单的值是{parent_table_pages}和{table_index_array}以及新增的{measure_index_array}') + applog.info(f"black_array:{black_array}") + + record_start = record_range.split('-')[0] + record_end = record_range.split('-')[1] + try: + for index in range(int(record_start),int(record_end)): + record = records[index] + ori_measure_name = record[0] + measure_name = record[1] + distance = record[2] + ori_measure_id = record[3] + measure_id = record[4] + measure_vector = redis_service.read_from_redis(redis_client,ori_measure_id) + measure_list = ast.literal_eval(measure_vector) + data = [measure_list] + # data.append(measure_list) + filter_str = 'file_id == "'+file_id+'"' + res = client.search( + collection_name="pdf_measure_v4", # Replace with the actual name of your collection + # Replace with your query vector + data=data, + limit=3, # Max. number of search results to return + search_params={"metric_type": "COSINE", "params": {}}, # Search parameters + output_fields=["measure_name","measure_value","table_num","table_index","measure_unit"], + filter=filter_str + ) + + + + # Convert the output to a formatted JSON string + # for i in range(len(res[0])): + for i in range(len(res[0])): + + vector_distance = float(res[0][i]["distance"]) + pdf_measure = res[0][i]["entity"]["measure_name"] + measure_value = res[0][i]["entity"]["measure_value"] + table_num = res[0][i]["entity"]["table_num"] + table_index = res[0][i]["entity"]["table_index"] + unit = res[0][i]["entity"]["measure_unit"] + + #先过滤页码为0的情况,暂时不知道原因 + if table_num == 0: + continue + + #过滤表格上方文字黑名单关键词的页码和表格下标 + if f"{table_num}" in table_index_array: + continue + + + #过滤指标中包含黑名单关键词 + if utils.check_pdf_measure_black_list(pdf_measure): + continue + + if f"{table_num}" in measure_index_array and utils.check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,conn_app,cursor_app): + #if utils.check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,conn_app,cursor_app): + applog.info(f'经过第三层规则去除了{table_num}页的{pdf_measure}指标') + continue + + + if vector_distance > distance and table_num not in parent_table_pages: + #检测规则开始 + #判断抽取指标和财报指标周期是否相同 + ori_period = utils.get_period_type(ori_measure_name, report_year) + pdf_period = utils.get_period_type(pdf_measure, report_year) + if pdf_measure == '2023年6月30日货币资金合计': + applog.info(f'第1处{ori_period}和{pdf_period}') + if(ori_period != pdf_period): + continue + + + #判断抽取指标和财报指标是否期初指标 + start_ori_period = utils.get_start_period_type(ori_measure_name) + start_pdf_period = utils.get_start_period_type(pdf_measure) + if pdf_measure == '2023年6月30日货币资金合计': + applog.info(f'第2处{start_ori_period}和{start_pdf_period}') + if(start_ori_period != start_pdf_period): + continue + + #判断抽取指标和财报指标类型是否相同,是否都是季度 + ori_season_type = 
utils.get_season_flag(ori_measure_name) + pdf_season_type = utils.get_season_flag(pdf_measure) + if pdf_measure == '2023年6月30日货币资金合计': + applog.info(f'第3处{ori_season_type}和{pdf_season_type}') + if(ori_season_type != pdf_season_type): + continue + + + #判断是否都是扣非指标 + ori_kf_type = utils.get_kf_flag(ori_measure_name) + pdf_kf_type = utils.get_kf_flag(pdf_measure) + if pdf_measure == '2023年6月30日货币资金合计': + applog.info(f'第4处{ori_kf_type}和{pdf_kf_type}') + if(ori_kf_type != pdf_kf_type): + applog.info(f'扣非指标{table_num}页的{pdf_measure}指标') + continue + + #判断抽取指标和财报指标类型是否相同,是否都是百分比 + ori_type = utils.get_percent_flag(ori_measure_name) + pdf_type = utils.get_percent_flag(pdf_measure) + if pdf_measure == '2023年6月30日货币资金合计': + applog.info(f'第5处{ori_type}和{pdf_type}') + if(ori_type != pdf_type): + continue + + #判断抽取指标和财报指标类型是否相同,是否都是占比同比变动类 + ori_growth_type = utils.get_percent_growth(ori_measure_name) + pdf_growth_type = utils.get_percent_growth(pdf_measure) + if pdf_measure == '2023年6月30日货币资金合计': + applog.info(f'第6处{ori_growth_type}和{pdf_growth_type}') + if(ori_growth_type != pdf_growth_type): + continue + + #解决指标语义是比率,但值为非比率的情况 + if ori_growth_type == '1': + check_measure_value = abs(float(measure_value)) + if(check_measure_value > 10000): + continue + + # 判断数据库中是否有数据 + check_query_data = (file_id, measure_name, int(table_num), int(table_index), measure_value) + cursor.execute(check_query, check_query_data) + check_records = cursor.fetchall() + if(len(check_records)) > 0: + continue + + #判断是否包含黑名单 + if(utils.check_black_list(measure_name,pdf_measure,black_array)): + continue + + if(utils.check_white_list(measure_name,pdf_measure)): + applog.info(f"measure_name{measure_name},pdf_measure{pdf_measure}") + continue + + #判断抽取指标和财报指标类型是否都是增长类,比如同比变动为增长类 + ori_change_type = utils.get_change_rate_flag(ori_measure_name) + pdf_change_type = utils.get_change_rate_flag(pdf_measure) + if(ori_change_type != pdf_change_type): + continue + + #处理调整前,调整前、后同时出现,如果有调整前过滤 + if pdf_measure.find('调整前') != -1 or pdf_measure.find('重述前') != -1: + continue + + #判断指标是否报告期初 + ori_report_start = utils.get_report_start(ori_measure_name) + pdf_report_start = utils.get_report_start(pdf_measure) + + if(ori_report_start != pdf_report_start): + continue + + #检测规则结束 + #获取指标单位数据,除了百分比 + if(utils.get_percent_flag(measure_name) == '0'): + unit_query_data = (file_id, int(table_num), int(table_index)) + cursor.execute(unit_query, unit_query_data) + unit_records = cursor.fetchall() + if unit != '' : + pass + elif unit == '' and (len(unit_records)) > 0: + unit = unit_records[0][0] + else: + unit = '元' + + data_to_insert = (file_id, file_name, "table", int(table_num), int(table_index), ori_measure_id, ori_measure_name, measure_value, create_time, create_time, vector_distance, pdf_measure,measure_id,measure_name,unit) + cursor.execute(insert_query, data_to_insert) + conn.commit() + except Exception as e: + applog.error(e) + finally: + redis_client.close() + cursor.close() + conn.close() + client.close() + +# +def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,file_id,file_name): + select_year_select = f"""select report_type,year from report_check where id = {file_id}""" + cursor.execute(select_year_select) + record_select = cursor.fetchall() + report_type = record_select[0][0] + report_year = record_select[0][1] + + select_query = ''' + SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config + where year = '{year}' + '''.format(year=report_year) + select_query_half_year = ''' + SELECT 
ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config_half_year + where year = '{year}' + '''.format(year=report_year) + select_query_thrid = ''' + SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config_third_quarter + where year = '{year}' + '''.format(year=report_year) + select_query_first_quarter = ''' + SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config_first_quarter + where year = '{year}' + '''.format(year=report_year) + # select_black_array_query = 'SELECT measure_name, keywords FROM measure_black_list where isdel = 0' + select_black_array_query = ''' + SELECT measure_name, keywords FROM measure_black_list where isdel = 0 and find_in_set('{year}',year) and find_in_set('{flag}',flag) + '''.format(year=report_year, flag=report_type) + + + black_array = [] + cursor.execute(select_black_array_query) + results = cursor.fetchall() + for row in results: + category = row[0] + keywords = row[1].split(',') + black_array.append(f"{category}:{','.join(keywords)}") + + if report_type == 1: + start_time = time.time() + cursor.execute(select_query_half_year) + records = cursor.fetchall() + end_time = time.time() + applog.info(f"向量配置数据查询 {(end_time - start_time):.2f} 秒。") + applog.info('insert_table_measure_from_vector_async_process方法走的半年报') + start_time = time.time() + records_range_parts = utils.get_range(len(records),MEASURE_COUNT) + processes = [] + for record_range in records_range_parts: + p = Process(target=insert_table_from_vector_mul_process, args=(parent_table_pages,file_id,file_name,records,record_range,black_array,)) + processes.append(p) + p.start() + elif report_type == 2: + start_time = time.time() + cursor.execute(select_query_first_quarter) + records = cursor.fetchall() + end_time = time.time() + applog.info(f"向量配置数据查询 {(end_time - start_time):.2f} 秒。") + applog.info('insert_table_measure_from_vector_async_process方法走的一季报') + start_time = time.time() + records_range_parts = utils.get_range(len(records),MEASURE_COUNT) + processes = [] + for record_range in records_range_parts: + p = Process(target=insert_table_from_vector_mul_process, args=(parent_table_pages,file_id,file_name,records,record_range,black_array,)) + processes.append(p) + p.start() + elif report_type == 3: + start_time = time.time() + cursor.execute(select_query_thrid) + records = cursor.fetchall() + end_time = time.time() + applog.info(f"向量配置数据查询 {(end_time - start_time):.2f} 秒。") + applog.info('insert_table_measure_from_vector_async_process方法走的三季报') + start_time = time.time() + records_range_parts = utils.get_range(len(records),MEASURE_COUNT) + processes = [] + for record_range in records_range_parts: + p = Process(target=insert_table_from_vector_mul_process, args=(parent_table_pages,file_id,file_name,records,record_range,black_array,)) + processes.append(p) + p.start() + + else: + start_time = time.time() + cursor.execute(select_query) + records = cursor.fetchall() + end_time = time.time() + applog.info(f"向量配置数据查询 {(end_time - start_time):.2f} 秒。") + applog.info('insert_table_measure_from_vector_async_process方法走的全年报') + start_time = time.time() + records_range_parts = utils.get_range(len(records),MEASURE_COUNT) + processes = [] + for record_range in records_range_parts: + p = Process(target=insert_table_from_vector_mul_process, args=(parent_table_pages,file_id,file_name,records,record_range,black_array,)) + processes.append(p) + p.start() + + applog.info(f'等待所有子任务完成,任务ID:{file_id}') + for p in processes: + 
p.join() + applog.info(f'所有子任务完成,任务ID:{file_id}') + applog.info(f'启动指标归一化任务ID:{file_id}') + end_time = time.time() + applog.info(f"向量更新时间 {(end_time - start_time):.2f} 秒。") + +def insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_id,file_name): + create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + select_query = ''' + SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config + ''' + + check_query = ''' + select id from ori_measure_list + WHERE file_id = %s and measure_name = %s and page_number = %s and table_index = %s and ori_measure_value = %s + ''' + insert_query = ''' + INSERT INTO ori_measure_list + (file_id, file_name, type, page_number, table_index, ori_measure_id, ori_measure_name, ori_measure_value, create_time, update_time, distance, pdf_measure,measure_id,measure_name,unit) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s ,%s) + ''' + select_year_select = f"""select report_type,year from report_check where id = {file_id}""" + cursor.execute(select_year_select) + record_select = cursor.fetchall() + report_type = record_select[0][0] + report_year = record_select[0][1] + + start_time = time.time() + cursor.execute(select_query) + records = cursor.fetchall() + end_time = time.time() + applog.info(f"向量配置数据查询 {(end_time - start_time):.2f} 秒。") + start_time = time.time() + + try: + for record in records: + ori_measure_name = record[0] + measure_name = record[1] + distance = record[2] + ori_measure_id = record[3] + measure_id = record[4] + measure_vector = redis_service.read_from_redis(ori_measure_id) + measure_list = ast.literal_eval(measure_vector) + data = [measure_list] + filter_str = 'file_id == "'+file_id+'"' + res = client.search( + collection_name="pdf_measure_v4", # Replace with the actual name of your collection + # Replace with your query vector + data=data, + limit=3, # Max. 
number of search results to return + search_params={"metric_type": "COSINE", "params": {}}, # Search parameters + output_fields=["measure_name","measure_value","table_num","table_index","measure_unit"], + filter=filter_str + ) + + # Convert the output to a formatted JSON string + for i in range(len(res[0])): + + vector_distance = float(res[0][i]["distance"]) + pdf_measure = res[0][i]["entity"]["measure_name"] + measure_value = res[0][i]["entity"]["measure_value"] + table_num = res[0][i]["entity"]["table_num"] + table_index = res[0][i]["entity"]["table_index"] + measure_unit = res[0][i]["entity"]["measure_unit"] + + if vector_distance > distance and table_num not in parent_table_pages: + #检测规则开始 + #判断抽取指标和财报指标周期是否相同 + ori_period = utils.get_period_type(ori_measure_name, report_year) + pdf_period = utils.get_period_type(pdf_measure, report_year) + if(ori_period != pdf_period): + continue + + #判断抽取指标和财报指标类型是否相同,是否都是百分比 + ori_type = utils.get_percent_flag(ori_measure_name) + pdf_type = utils.get_percent_flag(pdf_measure) + if(ori_type != pdf_type): + continue + + # 判断数据库中是否有数据 + check_query_data = (file_id, measure_name, int(table_num), int(table_index), measure_value) + cursor.execute(check_query, check_query_data) + check_records = cursor.fetchall() + if(len(check_records)) > 0: + continue + #检测规则结束 + + data_to_insert = (file_id, file_name, "table", int(table_num), int(table_index), ori_measure_id, ori_measure_name, measure_value, create_time, create_time, vector_distance, pdf_measure,measure_id,measure_name,measure_unit) + cursor.execute(insert_query, data_to_insert) + conn.commit() + except Exception as e: + applog.info(e) + end_time = time.time() + applog.info(f"向量更新数据时间 {(end_time - start_time):.2f} 秒。") + start_time = time.time() + + +def insert_measure_data_to_milvus(client,table_info,cursor,conn): + insert_query = ''' + INSERT INTO word_measure_parse_process + (file_id, page_num, content) + VALUES (%s, %s, %s) + ''' + for table in table_info: + try: + data=[] + table_num = table['page_num'].split("_")[0] + file_id = table['file_id'] + table_index = table['page_num'].split("_")[1] + + measure_list = table['measure_list'] + for measure in measure_list: + measure_name = measure['measure_name'] + # 需要跳过的一些指标 + black_list = ["营业总成本"] + if any(black in measure_name for black in black_list): + continue + measure_value = measure['measure_value'].replace("(", "").replace(")", "") + measure_name = utils.get_clean_text(measure_name) + measure_name = measure_name.replace('2023','2023年').replace('2022','2022年').replace('(','').replace(')','')#这个真绝了,怎么都删不掉 + #measure_name_1 = measure_name.replace('调整后','') + quarters = ['第一季度', '第二季度', '第三季度', '第四季度','增减','2023年','2022年','2021年','年'] + for quarter in quarters: + measure_name = measure_name.replace(quarter * 2, quarter) + pattern_dup = re.compile(r'(\w{3,})\1+')#去掉任意超过两个字且重复的字符 + matches = pattern_dup.findall(measure_name) + for match in matches: + applog.info(f"被删除的字符: {match * 2}") + measure_name = pattern_dup.sub(r'\1', measure_name) + measure_name_1 = measure_name.replace('调整后','').replace('上年期末数','上年期末').replace('上年期末','上年年末') + measure_unit = measure['measure_unit'] + if re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', measure_value) and any(key_word in measure_name for key_word in measure_name_keywords): + vector_obj = utils.embed_with_str(measure_name_1) + vector = vector_obj.output["embeddings"][0]["embedding"] + measure_data = {} + measure_data['vector'] = vector + measure_data['table_num'] = int(table_num) + measure_data['table_index'] = 
int(table_index) + measure_data['measure_name'] = measure_name + measure_data['measure_value'] = measure_value + measure_data['measure_unit'] = measure_unit + measure_data['file_id'] = file_id + data.append(measure_data) + + # 指标数据写入指标解析过程表,用于前端展示 + content = f"{measure_name}:{measure_value}" + data_to_insert = (file_id, table_num, content) + cursor.execute(insert_query, data_to_insert) + conn.commit() + elif re.match(r'(增加|减少|下降|上升)[了]?(\d+\.\d+)[个]?百分点', measure_value) and any(key_word in measure_name for key_word in measure_name_keywords): + #特殊处理指标值为增加了/减少了 XXX 个百分点 + unit_pattern = re.compile(r'(增加|减少|下降|上升)[了]?(\d+\.\d+)[个]?百分点') + match = unit_pattern.search(measure_value) + if match and len(match.groups()) == 2: + crease_type = match.group(1) + measure_value = match.group(2) + if crease_type == '减少' or crease_type == '下降': + measure_value = f'-{match.group(2)}' + + vector_obj = utils.embed_with_str(measure_name_1) + vector = vector_obj.output["embeddings"][0]["embedding"] + measure_data = {} + measure_data['vector'] = vector + measure_data['table_num'] = int(table_num) + measure_data['table_index'] = int(table_index) + measure_data['measure_name'] = measure_name + measure_data['measure_value'] = measure_value + measure_data['measure_unit'] = measure_unit + measure_data['file_id'] = file_id + data.append(measure_data) + + # 指标数据写入指标解析过程表,用于前端展示 + content = f"{measure_name}:{measure_value}" + data_to_insert = (file_id, table_num, content) + cursor.execute(insert_query, data_to_insert) + conn.commit() + res = client.insert( + collection_name="pdf_measure_v4", + data=data + ) + + + except Exception as e: + applog.error(f"异常信息=={e}") + +def runing_job(): + conn = mysql.connector.connect( + host= MYSQL_HOST, + user= MYSQL_USER, + password= MYSQL_PASSWORD, + database= MYSQL_DB + ) + + # 创建一个cursor对象来执行SQL语句 + cursor = conn.cursor(buffered=True) + select_query = ''' + SELECT * FROM report_check where status = 0 and isdel=0 + ''' + cursor.execute(select_query) + records = cursor.fetchall() + if(len(records)) > 1: + return True + return False + +def insert_word_parse_process(parser_info,conn,cursor,table_name): + # 执行SQL语句,插入数据 + insert_query = f''' + INSERT INTO {table_name} + (file_id, page_num, page_count, content, type) + VALUES (%s, %s, %s, %s, %s) + ''' + file_id = parser_info['file_id'] + page_num = int(parser_info['page_num']) + page_count = int(parser_info['page_count']) + content = json.dumps(parser_info['content'], ensure_ascii=False) + type = parser_info['type'] + data_to_insert = (file_id, page_num, page_count, content, type) + cursor.execute(insert_query, data_to_insert) + conn.commit() + + +def delete_database(file_id): + try: + conn = mysql.connector.connect( + host=MYSQL_HOST, + user=MYSQL_USER, + password=MYSQL_PASSWORD, + database=MYSQL_DB + ) + + # 创建一个cursor对象来执行SQL语句 + cursor = conn.cursor(buffered=True) + + truncate_query = [ + "delete from measure_parse_process where file_id = %s;", + "delete from measure_parser_info where file_id = %s;", + "delete from ori_measure_list where file_id = %s;", + "delete from measure_list where file_id = %s;", + "delete from word_parse_process where file_id = %s;", + "delete from table_unit_info where file_id = %s;", + # "delete from a where file_id = %s;", + # "delete from b where file_id = %s;", + ] + #file_id = file_id + for truncate in truncate_query: + cursor.execute(truncate,(file_id,)) + conn.commit() + except Exception as e: + applog.error(f'删除失败,原因是{e}') +def delete_to_run(conn,cursor,file_id): + try: + truncate_query = [ + 
"delete from ori_measure_list where file_id = %s;", + "delete from measure_list where file_id = %s;", + "delete from check_measure_list where file_id = %s;", + "delete from check_measure_detail_list where file_id = %s;", + # "delete from table_unit_info where file_id = %s;", + # "delete from pdf_parse_process where file_id = %s;", + # "delete from table_unit_info where file_id = %s;", + # "delete from a where file_id = %s;", + # "delete from b where file_id = %s;", + ] + #file_id = file_id + for truncate in truncate_query: + cursor.execute(truncate,(file_id,)) + conn.commit() + except Exception as e: + applog.error(f'删除失败,原因是{e}') + +def insert_word_text_info(file_id,table_info): + conn = mysql.connector.connect( + host=MYSQL_HOST, + user=MYSQL_USER, + password=MYSQL_PASSWORD, + database=MYSQL_DB + ) + cursor = conn.cursor(buffered=True) + + # 执行SQL语句,插入数据 + insert_query = ''' + INSERT INTO word_text_info + (file_id, page_num, text) + VALUES (%s, %s, %s) + ''' + + data_to_insert = [(file_id, int(line["index"]),int(line["data"])) for line in table_info] + cursor.executemany(insert_query,data_to_insert) + + conn.commit() + +def process_time(file_id,type,time,start_time,end_time): + conn = mysql.connector.connect( + host= MYSQL_HOST, + user= MYSQL_USER, + password= MYSQL_PASSWORD, + database= MYSQL_DB + ) + cursor = conn.cursor(buffered=True) + time = round(time, 2) + start_time = datetime.fromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S') + end_time = datetime.fromtimestamp(end_time).strftime('%Y-%m-%d %H:%M:%S') + insert_query = ''' + insert into word_process_time + (file_id,type,time,start_time,end_time) + values (%s, %s, %s,%s,%s) + ''' + data_insert = (file_id,type,time,start_time,end_time) + cursor.execute(insert_query,data_insert) + conn.commit() + + +def batch_insert_page_text(table_info, conn, cursor, table_name): + file_id = table_info['file_id'] + page_num = int(table_info['page_num']) + text_lines = table_info['text'] + + insert_query = f''' + INSERT INTO {table_name} + (file_id, page_num, text) + VALUES (%s, %s, %s) + ''' + data_to_insert = [(file_id, page_num, text_lines) ] + cursor.executemany(insert_query, data_to_insert) + + conn.commit() +def file_type_check(file_id): + conn = mysql.connector.connect( + host= MYSQL_HOST, + user= MYSQL_USER, + password= MYSQL_PASSWORD, + database= MYSQL_DB + ) + cursor = conn.cursor(buffered=True) + try: + select_query = ''' + SELECT report_type FROM report_check WHERE id = %s + ''' + cursor.execute(select_query, (file_id,)) + record = cursor.fetchone() + if record and record[0] == 5: + return True + return False + finally: + cursor.close() + conn.close() +def file_type_check_v2(file_id): + conn = mysql.connector.connect( + host= MYSQL_HOST, + user= MYSQL_USER, + password= MYSQL_PASSWORD, + database= MYSQL_DB + ) + cursor = conn.cursor(buffered=True) + try: + select_query = ''' + SELECT report_type FROM report_check WHERE id = %s + ''' + cursor.execute(select_query, (file_id,)) + record = cursor.fetchone() + return record[0] + # if record and == 5: + # return True + # return False + finally: + cursor.close() + conn.close() + +def word_title_insert_mysql(file_id,title_array): + conn = mysql.connector.connect( + host=MYSQL_HOST, + user=MYSQL_USER, + password=MYSQL_PASSWORD, + database=MYSQL_DB + ) + cursor = conn.cursor(buffered=True) + data_to_insert = [(file_id, line["data"], int(line["index"]),int(line["depth"])) for line in title_array] + + # 插入语句 + insert_query = """ + INSERT INTO word_title_info (file_id, title, page_num, depth) + 
VALUES (%s, %s, %s, %s) + """ + + cursor.executemany(insert_query, data_to_insert) + + conn.commit() + cursor.close() + conn.close() + + +def get_file_info_from_mysql(file_id): + conn = mysql.connector.connect( + host= MYSQL_HOST, + user= MYSQL_USER, + password= MYSQL_PASSWORD, + database= MYSQL_DB + ) + #cursor = conn.cursor(buffered=True) + cursor = conn.cursor(dictionary=True) + select_query = """ + SELECT title, page_num, depth + FROM pdf_title_info + WHERE file_id = %s + """ + + cursor.execute(select_query, (file_id,)) + result = cursor.fetchall() + cursor.close() + conn.close() + return result diff --git a/zzb_data_word/main_word.py b/zzb_data_word/main_word.py new file mode 100644 index 0000000..44a0f26 --- /dev/null +++ b/zzb_data_word/main_word.py @@ -0,0 +1,823 @@ +import re +import os,time +from config import MILVUS_CLIENT,MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB,MEASURE_COUNT,MYSQL_HOST_APP,MYSQL_USER_APP,MYSQL_PASSWORD_APP,MYSQL_DB_APP +import mysql.connector +import utils +from pymilvus import MilvusClient + +import numpy as np +from multiprocessing import Process +from config import REDIS_HOST,REDIS_PORT,REDIS_PASSWORD +import redis +import db_service_word +from zzb_logger import applog + + + +''' +已知发现问题: +1.表格和文本提取错误,表格和文本内容在同一页,文本在前表格在后的,文本数据提取不出来 +2.大模型抽取错,抽取2023年营业收入:主营业务收入、分产品的营业收入、变动比例被错误抽取 +3.表格中的指标被抽取成文本中 +4.大模型抽取指标时,语义完全不同的指标被放一起,考虑用向量相似度来判断 +''' + +# 数据处理流程 +# 1. get_table_range多进程获取所有表格及表格上下文,输出为一个完整的列表 +# 2. 单进程进行表格分页合并,输出一个新的表格对象数组 +# 3. 新表格对象数组多进程开始原来的解析指标流程 + + +STR_PATTERN = '营业收入|净利润|变动比例|损益|现金流量净额|现金净流量|现金流|每股收益|总资产|资产总额|收益率|货币资金|应收账款|存货|固定资产|在建工程|商誉|短期借款|应付账款|合同负债|长期借款|营业成本|销售费用|管理费用|财务费用|研发费用|研发投入' +PATTERN = '品牌类型|分门店|销售渠道|行业名称|产品名称|地区名称|子公司名称|业绩快报|调整情况说明|调整年初资产负债表|计入当期损益的政府补助|主要子公司|分部|母公司资产负债表|显示服务|渠道|商品类型|合同分类|会计政策变更|地区分类|研发项目|分类产品|表头不合规的表格|内部控制评价|关联方|国内地区|国外地区|销售区域|存货库龄|外币|逾期60天以上|欧元|英镑|美元|日元' +MUILT_PATTERN = '调整前' +#unit_pattern = re.compile(r'单位[:|:]?(百万元|千万元|亿元|万元|千元|元)') +unit_pattern = re.compile(r'(单位|单元|人民币).{0,6}?(百万元|千万元|亿元|万元|千元|元).{0,3}?')#修改单位匹配规则,不限制冒号,只限制距离 +#获取指标的表头信息 +def get_col_num_info(array,row_num,col_num,x,y): + num_info="" + for j in range(col_num): + if len(str(array[x][j])) > 50: + continue + num_info += str(array[x][j]) + + return num_info.replace('%','') + +#获取指标的表头信息 +def get_row_num_info(array,row_num,col_num,x,y): + num_info="" + + for i in range(row_num): + if len(str(array[i][y])) > 50: + continue + num_info += str(array[i][y]) + + return num_info + +def table_converter(table): + table_string = '' + # 遍历表格的每一行 + for row_num in range(len(table)): + row = table[row_num] + # 从warp的文字删除线路断路器 + cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row] + # 将表格转换为字符串,注意'|'、'\n' + table_string+=(','.join(cleaned_row)) + # 删除最后一个换行符 + table_string = table_string[:-1] + return table_string + +# 检查第二列是否为中文字符的函数 +def is_chinese(s): + return bool(re.search('[\u4e00-\u9fff]', s)) + +def check_table(arr): + split_index = None + for i in range(arr.shape[0]): + # 过滤掉第一行 + if arr[i, 0] == "" and is_chinese(arr[i, 1]) and i > 1: + split_index = i + break + if split_index is not None: + arr1 = arr[:split_index] + arr2 = arr[split_index:] + return [arr1, arr2] + else: + return [arr] + +def safe_process_array(func, arr): + try: + return func(arr) + except Exception as e: + print(f"这个函数出现了报错{func.__name__}: {e}") + return arr # 返回原数组以便继续后续处理 + + +# 单独针对三季报的资产负债表识别合并问题 +def process_array(arr, years=['2022', '2023', '2024'], keyword='项目'): + # 确保 row 有足够的列来存储分割后的数据 
+ def ensure_columns(row, num_columns): + while len(row) < num_columns: + row.append('') + + def is_valid_header(header, years, keyword): + header_text = header.lower() # 转小写以提高匹配的鲁棒性 + return any(year in header_text for year in years) and keyword in header_text + + # 对字符串进行清理 + def clean_text(text): + # 去除“年”和“月”相邻的空格 + text = re.sub(r'\s*(年|月)\s*', r'\1', text) + # 去除“日”左侧相邻的空格 + text = re.sub(r'\s*日', '日', text) + return text + + # 将 numpy 数组转换为列表 + arr = arr.tolist() if isinstance(arr, np.ndarray) else arr + + if len(arr[0]) == 1 and is_valid_header(arr[0][0], years, keyword): + remaining_value = arr[0][0] + + # 清理字符串 + remaining_value = clean_text(remaining_value) + + parts = remaining_value.split() + + ensure_columns(arr[0], len(parts)) + for i in range(len(parts)): + arr[0][i] = parts[i] + + header_columns = len(arr[0]) + + for i in range(1, len(arr)): + if len(arr[i]) == 1: + remaining_value = arr[i][0] + parts = remaining_value.split() + if len(parts) > header_columns: + parts = parts[:header_columns] + ensure_columns(arr[i], header_columns) + for j in range(len(parts)): + arr[i][j] = parts[j] + # 如果分割出的值不足,填充空值 + if len(parts) < header_columns: + for j in range(len(parts), header_columns): + arr[i][j] = '' + + return arr + + +# 三季报中针对性修改,本报告期和年初至报告期末的两个上年同期进行区分 +def process_array_with_annual_comparison(arr, keywords=['本报告期', '年初至报告期末', '上年同期']): + def contains_all_keywords(header, keywords): + return all(keyword in header for keyword in keywords) + + def split_and_replace_occurrences(header, target, replacement): + # 找到所有 target 出现的位置 + indices = [i for i, x in enumerate(header) if x == target] + if len(indices) > 1: + split_index = len(indices) // 2 + for i in range(split_index): + header[indices[i]] = replacement + return header + + # 将 numpy 数组转换为列表 + arr = arr.tolist() if isinstance(arr, np.ndarray) else arr + + if len(arr) > 0 and len(arr[0]) > 0: + first_row = arr[0] + + if contains_all_keywords(first_row, keywords): + # 将 "上年同期" 拆分并替换 + first_row = split_and_replace_occurrences(first_row, '上年同期', '三季报中无需识别的上年同期') + arr[0] = first_row + + return arr + + +# 三季报的非经常损益的单独处理 +def process_array_with_grants(arr, keywords=['本报告期', '年初至报告期'], target='计入当期损益的政府补助', + replacement='非经常性损益'): + # 检查第一行是否包含所有关键词 + def contains_all_keywords(header, keywords): + # return all(keyword in header for keyword in keywords) + return all(any(keyword in str(cell) for cell in header) for keyword in keywords) + + # 检查第一列中是否存在目标文本 + def contains_target_in_first_column(arr, target): + return any(target in str(item[0]) for item in arr) + + # 替换第一列中的特定值 + def replace_in_first_column(arr, target, replacement): + for i in range(len(arr)): + if arr[i][0] == target: + arr[i][0] = replacement + return arr + + # 将 numpy 数组转换为列表 + arr = arr.tolist() if isinstance(arr, np.ndarray) else arr + + if len(arr) > 0 and len(arr[0]) > 0: + first_row = arr[0] + + # 检查第一行和第一列的条件 + if contains_all_keywords(first_row, keywords) and contains_target_in_first_column(arr, target): + # 替换第一列中的 "合计" + arr = replace_in_first_column(arr, '合计', replacement) + + return arr + +# 处理表格数据 +def process_table(file_id, tables): + applog.info('Run task %s (%s)...' 
% (f'process word tables, file_id:{file_id}', os.getpid()))
+    start = time.time()
+
+    conn = mysql.connector.connect(
+        host=MYSQL_HOST,
+        user=MYSQL_USER,
+        password=MYSQL_PASSWORD,
+        database=MYSQL_DB
+    )
+    # cursor used for all SQL in this worker
+    cursor = conn.cursor(buffered=True)
+
+    for t in tables:
+        try:
+            arr = np.array(t["data"])
+
+            arr = safe_process_array(process_array, arr)  # merged balance-sheet header rows
+            arr = safe_process_array(process_array_with_annual_comparison, arr)  # disambiguate repeated "上年同期" columns in complex tables
+            arr = safe_process_array(process_array_with_grants, arr)  # Q3 non-recurring profit/loss tables
+
+            arr = np.char.replace(arr, ' ', '')
+            arr = np.char.replace(arr, '\n', '')
+            arr = np.char.replace(arr, ',', '')
+
+            arr_list = check_table(arr)
+
+            for a in arr_list:
+                new_data = a.tolist()  # kept for the database inserts below
+                new_data = utils.check_black_table_list(new_data)
+                rows, cols = a.shape
+                if rows == 1 and cols == 1:
+                    continue
+                arr_str = ''.join([''.join(map(str, row)) for row in a])
+                # store the full parse into word_parse_data first
+                db_service_word.insert_word_parse_process({
+                    'file_id': file_id,
+                    'page_num': t["index"],
+                    'page_count': 100,
+                    'type': 'table',
+                    'content': {
+                        'page_num': t["index"],
+                        'table_index': t["index"],
+                        "type": "table",
+                        "data": new_data,
+                    }}, conn, cursor, "word_parse_data")
+
+                # drop tables that contain none of the indicators we extract
+                matches = re.findall(STR_PATTERN, arr_str)
+                pattern = re.findall(PATTERN, arr_str)
+                muilt_pattern = re.findall(MUILT_PATTERN, arr_str)
+
+                if len(matches) > 0 and len(muilt_pattern) < 5:
+                # if len(matches) > 0 and len(pattern) == 0 and len(muilt_pattern) < 5:
+                    db_service_word.insert_word_parse_process({
+                        'file_id': file_id,
+                        'page_num': t["index"],
+                        'page_count': 100,
+                        'type': 'parse_table',
+                        'content': {
+                            'page_num': t["index"],
+                            'table_index': t["index"],
+                            "type": "table",
+                            "data": new_data,
+                        }}, conn, cursor,"word_parse_process")
+        except Exception as e:
+            applog.error(f'exception while parsing a table: {e}, content: {t}')
+    cursor.close()
+    conn.close()
+    end = time.time()
+    applog.info('Task %s runs %0.2f seconds.'
% (f'parse tables {file_id}', (end - start)))
+
+def text_in_table(top, tables_range, page_num):
+    if tables_range.get(page_num):
+        for rng in tables_range[page_num]:
+            if top < rng['top'] and top > rng['buttom']:
+                return True
+    return False
+
+def get_text_type(text: str):
+    text = re.sub(r"\s", "", text)
+    first_re = '年度报告'
+    page_number_pattern = re.compile(r'^\d+(/\d+)?$')
+
+    if re.search(first_re, text.strip()):
+        return 'page_header'
+
+    if page_number_pattern.match(text.strip()):
+        return 'page_footer'
+
+    if len(text) < 20 and text.endswith('页'):
+        return 'page_footer'
+
+    return 'text'
+
+def check_report_type(file_id):
+    conn = mysql.connector.connect(
+        host=MYSQL_HOST,
+        user=MYSQL_USER,
+        password=MYSQL_PASSWORD,
+        database=MYSQL_DB
+    )
+    # cursor for the lookup below
+    cursor = conn.cursor(buffered=True)
+    # returns (report_type, year) for the file, or None when the record is missing
+    select_year_select = """select report_type,year from report_check where id = %s"""
+    cursor.execute(select_year_select, (file_id,))
+    record_select = cursor.fetchall()
+    cursor.close()
+    conn.close()
+    if record_select:
+        report_type = record_select[0][0]
+        report_year = record_select[0][1]
+        return int(report_type),report_year
+    else:
+        return None
+
+
+
+# From a text element's index, find the nearest following table index,
+# checking the length and count of the text elements in between
+def get_next_table_index(text_index, texts, tables):
+    try:
+        for table in tables:
+            if table["index"] > text_index and table["type"] == "table":
+                table_index = table["index"]
+                total_len = sum(len(texts.get(key).get("data").replace(" " ,"")) for key in range(text_index + 1, table_index))
+                # the nearest table must be within 10 elements
+                if (table_index - text_index) < 10 and total_len < 50:
+                    # and the text in between must add up to fewer than 50 characters
+                    return table_index
+                else:
+                    return text_index
+    except StopIteration:
+        applog.error("Target not found")
+        return text_index
+    # no following table at all
+    return text_index
+
+
+# process the document's text stream
+def process_text_content(file_id,texts,tables,full_texts,type =0):
+    applog.info('Run task %s (%s)...'
% (f'处理word文件中的 text file_id:{file_id}', os.getpid())) + conn = mysql.connector.connect( + host=MYSQL_HOST, + user=MYSQL_USER, + password=MYSQL_PASSWORD, + database=MYSQL_DB + ) + # 创建一个cursor对象来执行SQL语句 + cursor = conn.cursor(buffered=True) + """ + :return: 返回pdf文件中文本内容,不包括表格 + """ + report_type, report_year = check_report_type(file_id) + texts_dict = {t["index"]:t for t in full_texts} + + query = "SELECT title_list,button_list FROM table_title_list WHERE report_year = %s" + cursor_dict = conn.cursor(dictionary=True) + cursor_dict.execute(query, (report_year,)) + result = cursor_dict.fetchone() + title_list = result['title_list'] + button_list = result['button_list'] + + try: + for t in texts: + line_text = t["data"] + line_text = re.sub(r"\s", "", line_text) + line_text = re.sub(r":", ":", line_text) + index = t["index"] + + if len(re.findall('母公司|现金流量表补充', line_text)) > 0: + db_service_word.insert_measure_parser_info({ + 'file_id': file_id, + 'content': get_next_table_index(index,texts_dict,tables), + 'type': 'parent_com', + }, conn, cursor) + + # 保存每个表格上方小范围区域的文字,这部分内容包含了表格的标题和指标单位 + table_info = {} + if (utils.check_table_title_black_list(line_text, title_list) + or utils.check_table_title_black_list_button(line_text,button_list)): + db_service_word.insert_measure_parser_info({ + 'file_id': file_id, + 'content': get_next_table_index(index,texts_dict,tables), + 'type': 'table_index', + }, conn, cursor) + if utils.check_table_title_black_list_measure(line_text): + db_service_word.insert_measure_parser_info_measure({ + 'file_id': file_id, + 'content': get_next_table_index(index, texts_dict,tables), + 'type': 'measure_index', + }, conn, cursor, line_text) + + + if re.findall(unit_pattern, line_text): + # 为单位 + table_info = get_table_unit_info(file_id,line_text,t["index"],t["index"]+1) + + db_service_word.insert_table_unit_info_v1(table_info,conn,cursor) + + if utils.check_table_title_black_list_measure(line_text): + db_service_word.insert_measure_parser_info_measure({ + 'file_id': file_id, + 'content': f"{t['index']}_1", + 'type': 'measure_index', + }, conn, cursor, line_text) + + if not utils.pdf_text_flag(line_text): + if utils.check_line_text(line_text): + db_service_word.insert_word_parse_process({ + 'file_id': file_id, + 'page_num' : t["index"], + 'page_count' : 100, + 'type' : 'parse_table', + 'content':{ + 'page_num' : t["index"], + 'table_index' : t["index"], + "type" : "text", + 'content' : line_text, + }},conn,cursor,"word_parse_process") + # 给慎用词校验用 + db_service_word.insert_word_parse_process({ + 'file_id': file_id, + 'page_num': t["index"], + 'page_count': 100, + 'type': 'text', + 'content': { + 'page_num': t["index"], + 'table_index': t["index"], + "type": "text", + 'content': line_text, + }}, conn, cursor, "word_parse_data") + + table_name = "word_text_info" + if type == 1: + table_name = "id_text_info" + # 写入数据库 传入表名 + db_service_word.batch_insert_page_text({ + 'file_id': file_id, + 'page_num' : t["index"], + 'text' : line_text + },conn,cursor, table_name) + + + for t in tables: + page_num = t["index"] + for lines in t["data"]: + lines = list(set(lines)) + for line in lines: + if len(line) == 0: + continue + db_service_word.batch_insert_page_text({ + 'file_id': file_id, + 'page_num' : page_num, + 'text' : line + },conn,cursor,"word_text_info") + + + except Exception as e: + applog.error(f'文本处理异常{e}') + + + +def get_table_unit_info(file_id,line_text,page_num,table_index): + table_info = {} + table_info['file_id'] = file_id + match = unit_pattern.search(line_text) + if match: 
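+        # unit_pattern carries two capture groups: group(1) is the lead-in word
+        # (单位/单元/人民币) and group(2) the unit token itself,
+        # e.g. '单位:万元' yields unit = '万元'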
+ unit = match.group(2) + table_info['unit'] = unit + + table_info['page_num'] = page_num + table_info['table_index'] = table_index + + return table_info + + +def get_table_text_info(file_id,line_text,page_num,table_index): + table_info = {} + table_info['file_id'] = file_id + table_info['text_info'] = line_text + table_info['page_num'] = page_num + table_info['table_index'] = table_index + + return table_info + +# 读取pdf中的表格,并将表格中指标和表头合并,eg: 2022年1季度营业收入为xxxxx +def get_table_measure(file_id, word_tables, record_range): + """ + :return: pdf中的表格,并将表格中指标和表头合并,eg: 2022年1季度营业收入为xxxxx + """ + try: + redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6) + conn = mysql.connector.connect( + host = MYSQL_HOST, + user = MYSQL_USER, + password = MYSQL_PASSWORD, + database = MYSQL_DB + ) + + # 创建一个cursor对象来执行SQL语句 + cursor = conn.cursor(buffered=True) + conn_app = mysql.connector.connect( + host = MYSQL_HOST_APP, + user = MYSQL_USER_APP, + password = MYSQL_PASSWORD_APP, + database = MYSQL_DB_APP + ) + + # 创建一个cursor对象来执行SQL语句 + cursor_app = conn_app.cursor(buffered=True) + + select_year_select = f"""select report_type,year from report_check where id = {file_id}""" + cursor.execute(select_year_select) + record_select = cursor.fetchall() + report_type = record_select[0][0] + report_year = record_select[0][1] + + client = MilvusClient( + uri= MILVUS_CLIENT + ) + applog.info('提取指标任务 %s (%s)...' % (record_range, os.getpid())) + start = time.time() + + record_start = record_range.split('-')[0] + record_end = record_range.split('-')[1] + for index in range(int(record_start),int(record_end)): + t = word_tables[index][0] + measure_obj =[] + data_dict = {} + measure_list = [] + try: + arr = np.array(t["data"]) + rows, cols = arr.shape + if rows == 1 and cols == 1: + continue + + row_num , col_num = -1 , -1 + + # 使用嵌套循环遍历数组,获取第一个数值位置 + for i in range(rows): + for j in range(cols): + if j == 0 or i == 0:#防止第一列识别出数字 + continue + measure_value_config = str(arr[i, j]).replace('(','').replace(')','') + + + if re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', measure_value_config): + if j == cols-1: + row_num, col_num = i, j + break + elif (re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', measure_value_config) + or measure_value_config == '-'): + row_num, col_num = i, j + break + else: + continue + break + # 遍历数值二维数组,转成带语义的指标 + if row_num != -1 and col_num != -1: + for i in range(row_num,arr.shape[0]): + for j in range(col_num,arr.shape[1]): + measure_value = str(arr[i, j]).replace('%','').replace('(','-').replace(')','') + if measure_value == '-' or measure_value == '' or len(measure_value) > 20: + continue + else: + row_num_info = get_row_num_info(arr,row_num,col_num,i,j) + col_num_info = get_col_num_info(arr,row_num,col_num,i,j) + + #如果上表头为空则认为是被截断,除了研发投入特殊处理其它过滤 + if row_num_info in ('','-',')',')'): + continue + + #特殊处理非经常性损益合计和非经常性损益净额同时出现时保留净额 + if col_num_info == '非经常性损益合计': + continue + + if utils.check_pdf_measure_black_list(f"{col_num_info}{row_num_info}"): + continue + + #去掉没有周期的指标 + if utils.check_pdf_measure(f"{col_num_info}{row_num_info}"): + continue + + #判断上表头和左表头周期是否一致,不一致过滤 + row_period = utils.get_period_type_other(row_num_info, report_year) + col_period = utils.get_period_type_other(col_num_info, report_year) + if(row_period != col_period and row_period != 'c_n' and col_period != 'c_n'): + continue + units_mapping = { + "百万元": "百万元", + "千万元": "千万元", + "亿元": "亿元", + "万元": "万元", + "千元": "千元", + "元": "元", + "元/股": "元" + } + row_num_info = row_num_info.replace('%','增减') + 
#num_info = f"{col_num_info}{row_num_info}".replace('()','').replace('加:','').replace('减:','').replace('%','') + num_info = utils.get_clean_text(f"{row_num_info}{col_num_info}") + num_info_bak = utils.get_clean_text(f"{col_num_info}{row_num_info}") + measure_unit = '' + #"%": "同期增减" + combined_info = f"{row_num_info} {col_num_info}" + # for unit in units_mapping: + # if unit in row_num_info: + # measure_unit = units_mapping[unit] + # break + if utils.get_percent_flag(row_num_info) == '1': + measure_unit = '' + else: + for unit in units_mapping: + if re.search(rf'\(\s*{unit}(\s*人民币)?\s*\)|\(\s*{unit}(\s*人民币)?\s*\)', combined_info) or (re.search(rf'{unit}', combined_info) and any(re.search('单位', item) for item in arr[0])): + measure_unit = units_mapping[unit] + break + measure_list.append({ + 'measure_name': num_info, + 'measure_value': measure_value, + 'measure_unit':measure_unit, + }) + measure_list.append({ + 'measure_name': num_info_bak, + 'measure_value': measure_value, + 'measure_unit':measure_unit, + }) + + if not redis_client.exists(f'parsed_measure_count_{file_id}'): + redis_client.set(f'parsed_measure_count_{file_id}', 0) + + redis_client.incr(f'parsed_measure_count_{file_id}') + + if len(measure_list) > 0: + data_dict["measure_list"] = measure_list + data_dict["page_num"] = f"{str(t['page_num'])}_{str(t['table_index'])}" + data_dict['file_id'] = file_id + measure_obj.append(data_dict) + db_service_word.insert_measure_data_to_milvus(client,measure_obj,cursor_app,conn_app) + except Exception as e: + applog.error(f"循环获取表格数据这里报错了,数据是{t['data']},位置在{index}") + applog.error(f"错误是:{e}") + end = time.time() + applog.info('提取指标 %s runs %0.2f seconds.' % (record_range, (end - start))) + except Exception as e: + applog.error(f'这个错误是{e},所在的位置是{record_start}-{record_end}') + record_start = record_range.split('-')[0] + record_end = record_range.split('-')[1] + for index in range(int(record_start),int(record_end)): + t = word_tables[index] + try: + arr = np.array(t['data']) + except Exception as e: + applog.error(f'这个错误是{e}的arr的值是{arr}') + + + finally: + redis_client.close() + client.close() + cursor.close() + conn.close() + cursor_app.close() + conn_app.close() + + +#指标归一化处理 + +def update_measure_data(file_id,file_path,parent_table_pages): + conn = mysql.connector.connect( + host = MYSQL_HOST, + user = MYSQL_USER, + password = MYSQL_PASSWORD, + database = MYSQL_DB + ) + + # 创建一个cursor对象来执行SQL语句 + cursor = conn.cursor(buffered=True) + # #通过向量查询指标 + conn_app = mysql.connector.connect( + host = MYSQL_HOST_APP, + user = MYSQL_USER_APP, + password = MYSQL_PASSWORD_APP, + database = MYSQL_DB_APP + ) + + # 创建一个cursor对象来执行SQL语句 + cursor_app = conn_app.cursor(buffered=True) + applog.info(f'目录黑名单为:{parent_table_pages}') + # db_service_word.delete_to_run(conn,cursor,file_id) + db_service_word.insert_table_measure_from_vector_async_process(cursor,parent_table_pages,file_id,file_path) + + # #指标归一化处理 + db_service_word.update_ori_measure(conn,cursor,file_id) + # db_service.delete_database(conn_app,cursor_app,file_id) + cursor.close() + conn.close() + cursor_app.close() + conn_app.close() + +def merge_consecutive_arrays(word_info): + merged_objects = [] + temp_list = [] + + for info_obj in word_info: + try: + if info_obj['type'] == 'table': + # 如果对象是表格,将其元素添加到临时列表中 + data = info_obj['data'] + if not data: + continue + first_row = data[0] + if all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) == 0: + temp_list.append(info_obj) + elif all(re.search(r'[\u4e00-\u9fa5]', cell) 
for cell in first_row[1:]) and len(temp_list) > 0: + merged_objects.append(temp_list) + temp_list = [] + temp_list.append(info_obj) + elif not all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) > 0: + temp_data = temp_list[-1]['data'] + temp_data = list(temp_data) + for row in list(info_obj['data']): + temp_data.append(row) + info_obj['data'] = temp_data + temp_list.clear() + temp_list.append(info_obj) + + + except Exception as e: + + applog.error(f"解析数据错误: {e}") + + if temp_list: + merged_objects.append(temp_list) + + return merged_objects + +def merge_consecutive_arrays_v1(pdf_info): + merged_objects = [] + temp_array = {} + + def is_same_dimension(data1, data2): + # 检查两个表的每行长度是否相同 + if len(data1) != len(data2): + return False + return all(len(row1) == len(row2) for row1, row2 in zip(data1, data2)) + + for info_obj in pdf_info: + try: + if info_obj['type'] == 'table': + if not temp_array: + # 如果临时列表为空,则初始化临时列表 + temp_array = info_obj + else: + # 检查当前表与临时列表中的表是否同维度 + if is_same_dimension(temp_array['data'], info_obj['data']): + # 如果是同维度,则合并数据 + temp_array['data'].extend(info_obj['data']) + else: + # 如果不是同维度,将现有临时列表添加到结果中,并重置临时列表 + merged_objects.append(temp_array) + temp_array = info_obj + else: + # 如果对象不是表格,检查临时列表是否非空 + if temp_array: + # 将临时列表中的元素合并成一个数组,并添加到新的对象列表中 + merged_objects.append(temp_array) + temp_array = {} # 重置临时列表 + except Exception as e: + applog.error(f"解析数据错误: {e}") + + # 循环结束后,检查临时列表是否非空,如果非空,则添加到结果中 + if temp_array: + merged_objects.append(temp_array) + + return merged_objects +def start_table_measure_job(file_id): + conn_app = mysql.connector.connect( + host = MYSQL_HOST_APP, + user = MYSQL_USER_APP, + password = MYSQL_PASSWORD_APP, + database = MYSQL_DB_APP + ) + + # 创建一个cursor对象来执行SQL语句 + cursor_app = conn_app.cursor(buffered=True) + + select_process_query = ''' + select DISTINCT content from word_parse_process WHERE file_id = '{file_id}' and type='parse_table' order by page_num + '''.format(file_id=file_id) + cursor_app.execute(select_process_query) + records = cursor_app.fetchall() + word_info = [] + for record in records: + word_info.append(eval(record[0])) + + # 获取table 数据 + word_tables = merge_consecutive_arrays(word_info) + redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6) + + redis_client.set(f'measure_count_{file_id}', len(word_tables)) + + cursor_app.close() + conn_app.close() + redis_client.close() + + records_range_parts = utils.get_range(len(word_tables),MEASURE_COUNT) + processes = [] + for record_range in records_range_parts: + p = Process(target=get_table_measure, args=(file_id,word_tables,record_range,)) + processes.append(p) + p.start() + + for p in processes: + p.join() + diff --git a/zzb_data_word/parse_word.py b/zzb_data_word/parse_word.py new file mode 100644 index 0000000..a047544 --- /dev/null +++ b/zzb_data_word/parse_word.py @@ -0,0 +1,269 @@ +from docx import Document +import json +from docx.oxml.table import CT_Tbl +from docx.oxml.text.paragraph import CT_P +from lxml import etree +import os +import zipfile + +RESULT_TYPE_TEXT = 'text' +RESULT_TYPE_TABLE = 'table' + +def build_result(result_type, index, data): + return { + 'type': result_type, + 'index': index, + 'data': data + } + +def build_catalog_result(index, depth, data): + return { + 'index': index, + 'depth': depth, + 'data': data + } + +# 解析docx文件中的XML内容 +def get_xml_content(docx_filename, xml_filename): + with zipfile.ZipFile(docx_filename) as z: + return z.read(xml_filename) + +def 
parse_paragraph(paragraph, index, namespaces): + paragraph_text = paragraph.text.strip() if paragraph else '' + if paragraph_text: + return build_result(RESULT_TYPE_TEXT, index, paragraph_text) + return None + +def parse_table(table, index): + table_data = [] + for row in table.rows: + row_data = [cell.text for cell in row.cells] + table_data.append(row_data) + return build_result(RESULT_TYPE_TABLE, index, table_data) + +def parse_paragraph_element(paragraph_element, index, namespaces): + paragraph_xml = etree.fromstring(paragraph_element.xml) + paragraph_text = ''.join(paragraph_xml.xpath('//w:t/text()', namespaces=namespaces)).strip() + if paragraph_text: + return build_result(RESULT_TYPE_TEXT, index, paragraph_text) + return None + +def parse_table_element(table_element, index, namespaces): + table_xml = etree.fromstring(table_element.xml) + table_data = [] + for row in table_xml.xpath('//w:tr', namespaces=namespaces): + row_data = [] + for cell in row.xpath('./w:tc | ./w:sdt', namespaces=namespaces): + cell_text = ''.join(cell.xpath('.//w:t/text()', namespaces=namespaces)).strip() + grid_span_xpath = etree.XPath('.//w:tcPr/w:gridSpan/@w:val', namespaces=namespaces) + grid_span = int(grid_span_xpath(cell)[0]) if grid_span_xpath(cell) else 1 + if grid_span > 1: + row_data.extend([cell_text] * grid_span) + else: + row_data.append(cell_text) + table_data.append(row_data) + return build_result(RESULT_TYPE_TABLE, index, table_data) + +def add_to_catalog(element_xml, index, catalog_content, namespaces, paragraph_text, heading_styles): + p_element = etree.fromstring(element_xml) + # outlineLvl = p_element.xpath('.//w:outlineLvl', namespaces=namespaces) + # if outlineLvl: + # level = int(outlineLvl[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val')) + # catalog_content.append(build_catalog_result(index, level, paragraph_text)) + level = is_heading_paragraph(p_element, heading_styles, namespaces) + if level != -1: + catalog_content.append(build_catalog_result(index, level, paragraph_text)) +# 检查段落是否为标题样式 +def is_heading_paragraph(paragraph, heading_styles, namespaces): + pPr = paragraph.find('.//w:pPr', namespaces=namespaces) + if pPr is not None: + pStyle = pPr.find('.//w:pStyle', namespaces=namespaces) + pOutLineLvl = pPr.find('.//w:outlineLvl', namespaces=namespaces) + if pStyle is not None: + style_val = pStyle.get(f"{{{namespaces['w']}}}val") + if style_val.isdigit(): + return int(style_val) + if pOutLineLvl is not None: + outLineLvl_val = pOutLineLvl.get(f"{{{namespaces['w']}}}val") + if outLineLvl_val.isdigit(): + return int(outLineLvl_val) + 1 + # if pStyle is not None and pStyle.get(ns['w'] + 'val') in heading_styles: + # if style_val > 0: + # return True + return -1 + +def get_paragraph_text(paragraph_element, namespaces): + paragraph_text = '' + for run in paragraph_element.findall('.//w:r', namespaces=namespaces): + for text in run.findall('.//w:t', namespaces=namespaces): + paragraph_text += text.text if text.text is not None else '' + return paragraph_text + +def add_to_catalog_paragraph(text, index, catalog_content, namespaces): + # 添加段落到目录 + catalog_content.append(build_catalog_result(index, 1, text)) # 假设默认级别为1 + +def parse_sdt_catalog(sdt_element, catalog_content, index, namespaces): + sdt_content = sdt_element.find('.//w:sdtContent', namespaces=namespaces) + if sdt_content is not None: + for child in sdt_content: + if child.tag.endswith('p'): # 内容控件中的段落 + paragraph_text = get_paragraph_text(child, namespaces) + if paragraph_text.strip(): # 检查文本是否为空 + 
add_to_catalog_paragraph(paragraph_text, index, catalog_content, namespaces) + index += 1 # 更新索引 + elif child.tag.endswith('tbl'): # 内容控件中的表格 + # 处理表格内容(如果需要) + pass + elif child.tag.endswith('sdt'): # 嵌套的内容控件 + index = parse_sdt_catalog(child, catalog_content, index, namespaces) # 递归解析嵌套的内容控件 + return index + +def parse_docx(docx_path): + try: + document = Document(docx_path) + styles_xml = get_xml_content(docx_path, 'word/styles.xml') + except Exception as e: + print(f"Error loading document: {e}") + return None, None + + doc_content = [] # 内容(文本+表格) + catalog_content = [] # 目录 + current_index = 1 # 维护全局的 index 变量 + paragraph_index = 0 + table_index = 0 + # 获取整个文档的XML内容 + xml_root = document.part.element + namespaces = xml_root.nsmap + + # 获取所有标题样式 + styles_root = etree.fromstring(styles_xml) + heading_styles = set() + for style in styles_root.xpath('//w:style', namespaces=namespaces): + style_type = style.get(namespaces['w'] + 'type') + if style_type == 'paragraph' and style.get(namespaces['w'] + 'styleId').startswith('Heading'): + heading_styles.add(style.get(namespaces['w'] + 'styleId')) + + # 遍历文档中的所有元素 + for i, element in enumerate(document.element.body): + if isinstance(element, CT_P): # 段落 + paragraph_result = parse_paragraph_element(element, current_index, namespaces) + if paragraph_result: + doc_content.append(paragraph_result) + # 判断是否为目录,是就插入目录内容 + paragraph = document.paragraphs[paragraph_index] + add_to_catalog(paragraph._element.xml, current_index, catalog_content, namespaces, paragraph.text, heading_styles) + current_index += 1 # 更新 index + paragraph_index += 1 + elif isinstance(element, CT_Tbl): # 表格 + table_result = parse_table_element(element, current_index, namespaces) + if table_result: + doc_content.append(table_result) + current_index += 1 # 更新 index + table_index += 1 + elif element.tag.endswith('sdt'): # 内容控件 + current_index = parse_sdt(element, doc_content, current_index, namespaces, catalog_content, heading_styles) # 更新索引 + + return json.dumps(doc_content, indent=4, ensure_ascii=False), json.dumps(catalog_content, indent=4, ensure_ascii=False) + + + +def parse_sdt(sdt_element, doc_content, current_index, namespaces, catalog_content, heading_styles): + sdtContent = sdt_element.find('.//w:sdtContent', namespaces=namespaces) + if sdtContent is not None: + for child in sdtContent: + if child.tag.endswith('p'): # 内容控件中的段落 + paragraph_text = '' + for run in child.findall('.//w:r', namespaces=namespaces): + for text in run.findall('.//w:t', namespaces=namespaces): + paragraph_text += text.text if text.text is not None else '' + if paragraph_text.strip(): # 检查文本是否为空 + doc_content.append(build_result(RESULT_TYPE_TEXT, current_index, paragraph_text.strip())) + # 判断是否为目录,是就插入目录内容 + add_to_catalog(child.xml, current_index, catalog_content, namespaces, paragraph_text, heading_styles) + current_index += 1 # 更新索引 + elif child.tag.endswith('tbl'): # 内容控件中的表格 + table_data = [] + merged_cells = {} # 用于记录跨行单元格的信息 + for row_idx, row in enumerate(child.findall('.//w:tr', namespaces=namespaces)): + row_data = [] + for col_idx, cell in enumerate(row.findall('.//w:tc', namespaces=namespaces)): + cell_text = '' + for run in cell.findall('.//w:r', namespaces=namespaces): + for text in run.findall('.//w:t', namespaces=namespaces): + cell_text += text.text if text.text is not None else '' + + # 检查单元格是否跨列 + grid_span_xpath = etree.XPath('.//w:tcPr/w:gridSpan/@w:val', namespaces=namespaces) + grid_span = int(grid_span_xpath(cell)[0]) if grid_span_xpath(cell) else 1 + if grid_span > 1: + 
row_data.extend([cell_text.strip()] * grid_span) + else: + row_data.append(cell_text.strip()) + + # 检查单元格是否跨行 + v_merge_xpath = etree.XPath('.//w:tcPr/w:vMerge/@w:val', namespaces=namespaces) + v_merge = v_merge_xpath(cell) + if v_merge and v_merge[0] == 'restart': + merged_cells[(row_idx, col_idx)] = (int(grid_span), 1) + elif v_merge and v_merge[0] == 'continue': + if (row_idx - 1, col_idx) in merged_cells: + merged_cells[(row_idx - 1, col_idx)] = (merged_cells[(row_idx - 1, col_idx)][0], merged_cells[(row_idx - 1, col_idx)][1] + 1) + # 跨行单元格不需要再次添加到 row_data 中 + else: + # 只有非跨行单元格才需要添加到 row_data 中 + pass + + # 处理跨行单元格 + for (r, c), (col_span, row_span) in list(merged_cells.items()): + if r < row_idx: + for i in range(row_span): + if r + i == row_idx: + row_data[c:c] = [row_data[c]] * (col_span - 1) + break + if r + row_span - 1 == row_idx: + del merged_cells[(r, c)] + + table_data.append(row_data) + if table_data: # 检查表格数据是否为空 + doc_content.append(build_result(RESULT_TYPE_TABLE, current_index, table_data)) + current_index += 1 # 更新索引 + elif child.tag.endswith('sdt'): # 嵌套的内容控件 + current_index = parse_sdt(child, doc_content, current_index, namespaces, catalog_content, heading_styles) # 递归解析嵌套的内容控件 + return current_index # 返回更新后的索引 + +def split_text_table(json_data): + # 分组 + text_elements = [element for element in json_data if element['type'] == 'text'] + table_elements = [element for element in json_data if element['type'] == 'table'] + + # 转换为JSON字符串 + text_elements_json = json.dumps(text_elements, ensure_ascii=False, indent=4) + table_elements_json = json.dumps(table_elements, ensure_ascii=False, indent=4) + + return text_elements_json, table_elements_json + +def append_to_file(file_path, text): + try: + with open(file_path, 'a', encoding='utf-8') as file: + file.write(text + '\n') + except Exception as e: + print(f"Error writing to file: {e}") + +if __name__ == "__main__": + current_directory = os.getcwd() + docx_relative_path = '101.docx' + file_relative_path = 'file\\docx\\test1.txt' + docx_path = os.path.join(current_directory, docx_relative_path) + file_path = os.path.join(current_directory, file_relative_path) + try: + parsed_content, catalog_content = parse_docx(docx_path) + if parsed_content and catalog_content: + json_parsed_content = json.loads(parsed_content) + text_elements_json, table_elements_json = split_text_table(json_parsed_content) + + append_to_file(file_path, text_elements_json) + append_to_file(file_path, table_elements_json) + append_to_file(file_path, catalog_content) + except Exception as e: + print(f"Error parse_docx: {e}") \ No newline at end of file diff --git a/zzb_data_word/redis_service.py b/zzb_data_word/redis_service.py new file mode 100644 index 0000000..b767944 --- /dev/null +++ b/zzb_data_word/redis_service.py @@ -0,0 +1,17 @@ +import redis +# 从 MySQL 表中读取数据并写入 Redis +def read_from_file_and_write_to_redis(redis_client,ori_measure_id,measure_vector): + # Redis 连接配置 + redis_client.hset('measure_config',ori_measure_id, measure_vector) + +# 从 Redis 中读取数据 +def read_from_redis(redis_client,ori_measure_id): + # 获取所有键 + return redis_client.hget('measure_config',ori_measure_id).decode() + +# if __name__ == "__main__": +# # redis_client = redis.Redis(host='123.60.153.169', port=6379, password='Xgf_redis', db=6) +# redis_client = redis.Redis(host='124.70.129.232', port=6379, password='Xgf_redis', db=6) +# +# value = read_from_redis(redis_client,"92b44ffb50b6ab2068f5de447c9925") +# print(value) \ No newline at end of file diff --git 
a/zzb_data_word/requirements.txt b/zzb_data_word/requirements.txt new file mode 100644 index 0000000..a38c143 --- /dev/null +++ b/zzb_data_word/requirements.txt @@ -0,0 +1,14 @@ +camelot-py==0.11.0 +pdfminer.six==20221105 +PyPDF2==3.0.1 +pdfplumber==0.10.3 +pymilvus==2.3.3 +mysql-connector-python==8.3.0 +dashscope==1.17.0 +fastapi +pydantic +uvicorn +redis +ghostscript +opencv-python-headless +python-docx \ No newline at end of file diff --git a/zzb_data_word/utils.py b/zzb_data_word/utils.py new file mode 100644 index 0000000..a735aac --- /dev/null +++ b/zzb_data_word/utils.py @@ -0,0 +1,818 @@ +#coding=utf-8 + +import dashscope +from http import HTTPStatus +from pymilvus import MilvusClient +import json +from datetime import datetime +import re,os,time +import requests +import config +import numpy as np +from docx2pdf import convert +from config import api_key + + +dashscope.api_key = api_key + + +def get_md5(str): + import hashlib + m = hashlib.md5() + m.update(str.encode('utf-8')) + return m.hexdigest() + +def embed_with_str(input): + retry = 0 + max_retry = 5 + t = 0.1 + while retry < max_retry: + #阿里接口限流 + time.sleep(t) + resp = dashscope.TextEmbedding.call( + model=dashscope.TextEmbedding.Models.text_embedding_v2, + input=input) + if resp.status_code == HTTPStatus.OK: + return resp + elif resp.status_code == 429: + print(f'触发限流,等待{t}秒后重试') + retry += 1 + t+=0.1 + else: + print(f'请求失败,状态码:{resp.status_code}') + return None + print('重试超过上限') + return None + +#如果存在‘归属于|扣非’,就保留括号内的内容,并去掉标点符号和中文数字。 +#如果存在季度关键词,就将括号内容替换为季度 +#如果存在‘±’,就将括号内容替换为同期增减 +#其他情况,就删掉括号内全部内容 +def get_clean_text(text): + text = text.replace('流动资产:','').replace('半年度','上半年') + #先对几个半年报的词做整理,防止向量识别不出来 + terms = ["货币资金", "应收账款",'应付账款'] + #这个是不要合计的 + terms_2 = ["固定资产","短期借款","合同负债","在建工程","商誉","存货"] + #这个是需要调换位置的指标 + #terms_3 = ["固定资产","短期借款","合同负债","在建工程","商誉"] + #不可以出现同比之类的 + terms_4 = ['比', '率', '占','至','年以内','年以上','年内','1-2年','2-3年','3-4年','4-5年','准备','在途','增值','评估','利息','应计','改良','跌价','补助','投资'] + dates = [ "2021年12月31日","2022年12月31日","2022年1月1日","2023年1月1日", "2023年12月31日", "2022年6月30日","2023年6月30日","2024年6月30日","2024年半年度","2023年半年度","2022年半年度"] + #dates = [ "2021年12月31日","2022年12月31日","2023年12月31日","2022年1月1日","2023年1月1日", "2024年1月1日", "2022年6月30日","2023年6月30日","2024年6月30日","2021年初","2022年初","2023年初","2024年初",'2021年末','2022年末','2023年末','2024年末',"2023年","2022年","2021年"] + if any(term in text for term in terms_4): + return text + if len(text) <= 20: + for term in terms: + for date in dates: + if term in text and date in text: + text = f"{date}{term}合计" + return text + if len(text) <= 20: + for term in terms_2: + for date in dates: + if term in text and date in text: + text = f"{date}{term}" + return text + + import re + replacement_dict = { + '加:': '', + '减:': '', + '%' : '', + '其中:': '', + '实际': '', + '/': '', + '重述后':'', + '年末金额':'年末', + '比重增减':'同比增减', + '比例':'同比', + } + #针对整个text做替换 + def replace_all(text, replacements): + pattern = re.compile("|".join(map(re.escape, replacements.keys()))) + return pattern.sub(lambda match: replacements[match.group(0)], text) + text = replace_all(text, replacement_dict) + #单独出现12月31日时,就剔除掉 + pattern_year = r'(? 
0: + return 'c' + elif len(re.findall(l_period, text)) > 0: + return 'l' + elif len(re.findall(bl_period, text)) > 0: + return 'bl' + else: + return 'c' + +def get_period_type_other(text, year): + l_year = f'{int(year)-1}' + bl_year = f'{int(year)-2}' + c_period = f'当期|本期|本报告期|报告期|本年|本期|{year}' + l_period = f'上年|上期|上年度|{l_year}' + bl_period = f'前年|{bl_year}' + + if len(re.findall(c_period, text)) > 0: + return 'c' + elif len(re.findall(l_period, text)) > 0: + return 'l' + elif len(re.findall(bl_period, text)) > 0: + return 'bl' + else: + return 'c_n' + +def get_start_period_type(text): + s_period = '期初|1月1日|年初' + + if len(re.findall(s_period, text)) > 0: + return '' + else: + return '0' + +def get_season_flag(text): + season_period = '第1季度|第2季度|第3季度|第4季度|一季度|二季度|三季度|四季度|1-3月|4-6月|7-9月|10-12月' + if len(re.findall(season_period, text)) > 0: + return '1' + else: + return '0' + +def get_percent_flag(text): + percent_word = '收益率|占比|比重|比例|同比增减|同比上升|同比下降|变化幅度|同期增减|本年比上年增减|同比变动|变动比例|本年度比上年度增减|增减' + if len(re.findall(percent_word, text)) > 0: + return '1' + else: + return '0' + +def get_kf_flag(text): + kf_word = '扣非|扣除非经常性损益' + if len(re.findall(kf_word, text)) > 0: + return '1' + else: + return '0' + +def get_report_start(text): + kf_word = '报告期初|1月1日' + if len(re.findall(kf_word, text)) > 0: + return '1' + else: + return '0' + +def get_percent_growth(text): + percent_growth_word = '变动|本年比上年|比例同比增减|比例同比上升|比例同比下降|比例变化幅度|比例变动比例|比例本期比上年同期增减|比例本年比上年增减|比例同比变动|比例本期期末金额较上期期末变动比例|比率同比增减|比率同比上升|比率同比下降|比率变化幅度|比率变动比例|比率本期比上年同期增减|比率本年比上年增减|比率同比变动|比率本期期末金额较上期期末变动比例|占比同比增减|占比同比上升|占比同比下降|占比变化幅度|占比变动比例|占比本期比上年同期增减|占比本年比上年增减|占比同比变动|占比本期期末金额较上期期末变动比例|费用同比增减|费用同比上升|费用同比下降|费用变化幅度|费用变动比例|费用本期比上年同期增减|费用本年比上年增减|费用同比变动|费用本期期末金额较上期期末变动比例' + if len(re.findall(percent_growth_word, text)) > 0: + return '1' + else: + return '0' +def check_black_list(meta_measure, pdf_measure, black_array): + # 获取黑名单数据 + #black_array = fetch_black_list_data(cursor) + + for black in black_array: + black_meta = black.split(':')[0] + black_pdfs = black.split(':')[1].split(',') + if meta_measure==black_meta: + for pdf in black_pdfs: + if pdf_measure.find(pdf) >= 0: + return True + return False + +def check_black_list_old(meta_measure,pdf_measure): + # 判断指标名是否包含黑名单词 + #black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额','营业收入:营业外收入,主营业务,营业总收入,扣除,年度公司','归母净利润:净资产,净利率,扣除,年度公司','扣非净利润:净资产,净利率,年度公司','经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计','投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计','非经常性损益:扣除非经常性损益','基本每股收益:稀释每股收益','稀释每股收益:基本每股收益','总资产:净资产','应收账款:应付账款','短期借款:长期借款','应付账款:应收账款','长期借款:短期借款','研发投入:比例,比率,占比,费用','资本化研发投入:比例,比率,占比,费用','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用','上年年末:1月1日'] + black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额,合计' + ,'营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司,合计,汇总' + ,'归母净利润:净资产,净利率,扣除,年度公司,归属于本公司普通股股东的净利润' + ,'扣非净利润:净资产,净利率,年度公司' + ,'经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计,每股,扣除' + ,'筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计,每股,扣除' + ,'投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计,每股,扣除' + ,'非经常性损益:扣除非经常性损益' + ,'基本每股收益:稀释每股收益,发行新股' + ,'稀释每股收益:基本每股收益,发行新股' + ,'总资产:净资产','应收账款:应付账款,年以上,内,至,到' + ,'短期借款:长期借款,非流动负债,年以上,年以内,内,至,到' + ,'应付账款:应收账款,年以上,内,至,到' + ,'长期借款:短期借款,非流动负债,年以上,内,至,到,保证,抵押' + ,'研发投入:比例,比率,占比,费用,占' + ,'资本化研发投入:比例,比率,占比,费用,占' + ,'资本化研发投入占比:金额,费用' + ,'研发投入占营业收入比例:金额,费用' + ,'上年年末:1月1日' + ,'期加权平均净资产收益率:同比,扣除,扣非,年化,每股' + ,'期扣非加权平均净资产收益率:同比,年化,每股' + ,'加权平均净资产收益率同比变动:年化,每股' + ,'研发费用:制造,投入,直接,管理' + ,'应收账款:1-2年','货币资金:在途' + ,'当期:2023年1-6月,调整后' + ,'营业成本:营业总成本' + ,'长期借债:年内到期','研发投入:直接' + ,'第一季度:第二季度,第三季度,第四季度' + ,'第二季度:第一季度,第三季度,第四季度' + ,'第三季度:第二季度,第一季度,第四季度' + ,'第四季度:第二季度,第三季度,第一季度' + 
,'研发费用:研发支出,研发投入','存货:跌价准备' + ,'费用:日常,付现','固定资产:改良,补助,投资'] + # current_period = f'当期:{report_year}年1-6月' + # black_array.append(current_period) + for black in black_array: + black_meta = black.split(':')[0] + black_pdfs = black.split(':')[1].split(',') + if meta_measure.find(black_meta) >= 0: + for pdf in black_pdfs: + if pdf_measure.find(pdf) >= 0: + return True + return False + +def check_white_list(meta_measure,pdf_measure): + white_array = ['基本每股收益:每股收益','加权平均净资产收益率同比变动:比','季度变动比例:比'] + for black in white_array: + black_meta = black.split(':')[0] + black_pdfs = black.split(':')[1].split(',') + if black_meta in meta_measure: + for pdf in black_pdfs: + if pdf_measure.find(pdf) < 0: + return True + return False + +def check_title_black_list(meta_measure,text_info): + # 判断指标名是否包含黑名单词 + black_array = ['营业收入:前五名,前5名,合计','营业成本:合计','财务费用:现金流','销售费用:现金流','管理费用:现金流','研发费用:现金流','非经常性损益:合计'] + for black in black_array: + black_meta = black.split(':')[0] + black_pdfs = black.split(':')[1].split(',') + if meta_measure.find(black_meta) >= 0: + for pdf in black_pdfs: + if text_info.find(pdf) >= 0: + return True + return False + +# 文本中数字的占比 +def under_non_alpha_ratio(text: str, threshold: float = 0.6): + + if len(text) == 0: + return False + + alpha_count = len([char for char in text if char.strip() and char.isalpha()]) + total_count = len([char for char in text if char.strip()]) + try: + ratio = alpha_count / total_count + return ratio <= threshold + except: + return False +def check_table_title_black_list(text,table_title_black_list):#report_year + #previous_year = int(report_year) - 1 + if table_title_black_list is None: + return False + if len(re.findall(table_title_black_list, text)) > 0: + return True + if re.search(r'上年度\s*$', text): + return True + return False +#通过关键词黑名单匹配表格上方的文本区域,提取需要过滤的表格 +def check_table_title_black_list_old(text,report_year):#report_year + previous_year = int(report_year) - 1 + table_title_black_list = f"""所有权或使用权受到限制的资产|持有待售资产|关联交易|未确认递延所得税资产明细|{previous_year}年度|{previous_year}年1-6月|自{previous_year}年1月1日至6月30日止期间|流动性风险|关联交易|账龄超过|流动风险|公司资产负债表|按账龄组合|线上直营|线上直销|公司现金流量表|公司利润表|应收账款|在建工程|固定资产|其他与筹资活动有关的现金|汇率风险|市场风险|主营业务收入|主营收入|其他收入|前五名|前5名|经营活动有关的现金|股份变动对最近一年和最近一期每股收益、每股净资产等财务指标的影响|合同产生的收入情况|子公司|参股公司|控股公司|分解信息|经营活动产生的现金|行业分类|产品分类|地区分类|业绩快报|销售渠道|调整情况说明|合同分类|计入当期损益的政府补助|股份变动对最近一年和最近一期|分部的财务信息|显示服务创收|线上销售情况|试运行销售|会计政策变更|品牌经营业务|工程施工业务|开发业务|制造业务|合营安排或联营企业中的权益|联营企业的主要财务信息|汇率及通货膨胀|与金融工具相关的风险|运营业务|B端业务|终止经营现金流量|终止经营|公司股份总数及股东结构变动及公司资产和负债结构的变动情况|母公司|现金流量表补充|直营店店效情况|担保人2023年度未经审计的|外汇风险|公司各业务板块经营情况|报告期确认的包括在合同负债期初账面价值中的收入|资产受限情况|资产权利受限情况|内控自我评价报告|所有权或使用权受限资产|合并日被合并方资产、负债的账面价值|经营租赁资产|前5|前五|②|不属于现金及现金等价物的货币资金|按销售模式分|按产品类别分|按照销售区域|产品类别|销售模式|经销模式|关键管理人员|截至{previous_year}年6月30日止六个月期间|关联方提供的存款及贷款服务|报告期内各销售渠道的盈利情况|报告期内各地区的盈利情况|报告期内各产品的盈利情况|其他非流动负债|关联方提供的存款及贷款服务|自营销售分商品类别数据|组合计提|考核指标|不属于现金及现金等价物的货币资金|应收款项融资|本期计提、收回或转回的坏账准备情况|存货跌价准备|持有待售负债""" + + if len(re.findall(table_title_black_list, text)) > 0: + return True + if re.search(r'上年度\s*$', text): + return True + return False +#通过关键词黑名单匹配页面下方的文本区域,提取需要过滤的表格 + +def check_table_title_black_list_button(text,table_title_black_list): + + if table_title_black_list is None: + return False + + if len(re.findall(table_title_black_list, text)) > 0: + return True + if re.search(r'上年度\s*$', text): + return True + return False +def check_table_title_black_list_button_old(text): + + table_title_black_list = 
"""公司资产负债表|公司现金流量表|公司利润表|主营业务收入|主营收入|其他收入|前五名|前5名|经营活动有关的现金|股份变动对最近一年和最近一期每股收益、每股净资产等财务指标的影响|合同产生的收入情况|子公司|参股公司|控股公司|分解信息|经营活动产生的现金|2022年度|行业分类|产品分类|地区分类|业绩快报|销售渠道|调整情况说明|合同分类|计入当期损益政府补助|股份变动对最近一年和最近一期|分部的财务信息|显示服务创收|线上销售情况|试运行销售|品牌经营业务|工程施工业务|开发业务|制造业务|合营安排或联营企业中的权益|联营企业的主要财务信息|汇率及通货膨胀|与金融工具相关的风险|运营业务|B端业务|终止经营现金流量|终止经营|公司股份总数及股东结构变动及公司资产和负债结构的变动情况|不属于现金及现金等价物的货币资金|经营租赁资产|分地区|分产品|分行业|使用权受限资产|资产受限情况|经销模式|持续的第三层次公允价值计量项目,期初与期末账面价值间的调节信息及不可观察参数敏感|权利受限情况|应收款项融资|本期计提、收回或转回的坏账准备情况""" + + + if len(re.findall(table_title_black_list, text)) > 0: + return True + if re.search(r'上年度\s*$', text): + return True + return False +def check_table_title_black_list_measure(text): + #black_array = ['补充资料:研发费用,管理费用,财务费用' + # ,'营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司,合计,汇总' + #] + table_title_black_list = """补充资料|测试文本|其他非流动负债|应收款项融资|本期计提、收回或转回的坏账准备情况|筹资活动产生的各项负债变动情况|持有待售资产|账龄超过 1 年或逾期的重要应付账款|经营租赁资产|计息金融工具|坏账准备""" + if len(re.findall(table_title_black_list, text)) > 0: + return True + return False +#过滤原始指标中包含黑名单 +def check_pdf_measure_black_list(text): + pdf_measure_black_list = '股权变动前|股权变动后|含股份支付|境内|境外|调整前|有限公司|责任公司|其他|变更前|差异|同口径|调整金额' + if len(re.findall(pdf_measure_black_list, text)) > 0: + return True + if "其中:营业收入" in text: + return False + if "同比" in text and "额" in text: + #if text.find("同比") < text.find("额"): + if text.endswith("额"): + return True + return False + + +def check_pdf_measure(pdf_measure): + keywords_1 = [ + '2022年', '2023年', '2021年', '第一季度', '第二季度', '第三季度', '第四季度', '增减', '变动', '本期','同期', '当期', '报告期', '前年', + '上年', '上期', '本年', '1-3月', '4-6月', '7-9月', '10-12月' + ] + + keywords_2 = ['这里是一个测试文本'] + + contain_keyword_1 = any(keyword in pdf_measure for keyword in keywords_1) + contain_keyword_2 = any(keyword in pdf_measure for keyword in keywords_2) + #只有 未出现周期,同时出现了'调整后'才会删掉指标 + if not contain_keyword_1 and contain_keyword_2: + return True + return False +# def check_white_list(meta_measure,pdf_measure): +# # 判断指标名是否包含白名单词 +# black_array = ['营业收入:营业外收入,主营业务,营业总收入,扣除','归母净利润:净资产,净利率,扣除','扣非净利润:净资产,净利率','经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计','投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计','非经常性损益:扣除非经常性损益','基本每股收益:稀释每股收益','稀释每股收益:基本每股收益','总资产:净资产','应收账款:应付账款','短期借款:长期借款','应付账款:应收账款','长期借款:短期借款','研发投入:比例,比率,占比,费用','资本化研发投入:比例,比率,占比,费用','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用'] +# for black in black_array: +# black_meta = black.split(':')[0] +# black_pdfs = black.split(':')[1].split(',') +# if meta_measure.find(black_meta) >= 0: +# for pdf in black_pdfs: +# if pdf_measure.find(pdf) >= 0: +# return True +# return False +def check_line_text(line_text): + if line_text == 'PAGE': + return False + if line_text == '(续)': + return False + if line_text.endswith('(续)'): + return False + if line_text.endswith("年度财务报表") and "有限公司" in line_text: + return False + if len(line_text) < 20 and line_text.endswith("有限公司"): + return False + substrings = [ + '对内加快发展方式绿色转型、对外形成绿色生产和生活方式', + '可持续发展、创新发展;“8”是八大绿色行动', + '色新赋能、催生绿色新科技、筑牢绿色新支撑', + '接上表','续上表', + ] + for substring in substrings: + if substring in line_text: + return False + return True + +def pdf_text_flag(text : str): + if under_non_alpha_ratio(text): + return True + + if len(text) < 5: + return True + + if not re.findall(',|,|。|、|(|)',text): + return True + + if text.find('适用') != -1 and text.find('不适用') != -1: + return True + + if text.find('是') != -1 and text.find('否') != -1: + return True + + return False + +def get_change_rate_flag(text): + percent_word = '同比增减|同比上升|同比下降|变化幅度|变动比例|本期比上年同期增减|本年比上年增减|同比变动|本期期末金额较上期期末变动比例' + if len(re.findall(percent_word, 
text)) > 0: + return '1' + else: + return '0' + +def check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,conn_app,cursor_app): + content_value = f"{table_num}_{table_index}" + measure_index_array = [] + select_measure_index_query = ''' + SELECT DISTINCT text FROM measure_parser_info_linetext WHERE file_id = %s AND type = 'measure_index' and content = %s + ''' + cursor_app.execute(select_measure_index_query, (file_id,content_value,)) + measure_index_records = cursor_app.fetchall() + for measure_index_record in measure_index_records: + measure_index_array.append(measure_index_record[0]) + black_array = ['补充资料:研发费用,管理费用,财务费用,销售费用' + ,'测试标题:测试指标' + ,'其他非流动负债:合同负债' + ,'应收款项融资:应收账款' + ,'本期计提、收回或转回的坏账准备情况:应收账款' + ,'筹资活动产生的各项负债变动情况:短期借款,长期借款' + ,'持有待售资产:固定资产' + ,'账龄超过 1 年或逾期的重要应付账款:应付账款' + ,'经营租赁资产:固定资产' + ,'计息金融工具:货币资金,短期借款,交易性金融资产' + ,'坏账准备:应收账款' + ] + for black in black_array: + black_meta = black.split(':')[0] + black_pdfs = black.split(':')[1].split(',') + #if measure_index_array.find(black_meta) >= 0: + #if black_meta in measure_index_array: + if any(black_meta in measure_index for measure_index in measure_index_array): + if any(pdf in pdf_measure for pdf in black_pdfs): + #for pdf in black_pdfs: + #if pdf in pdf_measure: + #if pdf_measure.find(pdf) >= 0: + return True + return False +def check_black_table_list(data): + black_array = ['补充资料:研发费用,管理费用,财务费用,销售费用', + #'补充目录:母公司' + ] + for black in black_array: + black_meta = black.split(':')[0] + black_pdfs = black.split(':')[1].split(',') + if any(black_meta in cell for row in data for cell in row): + print(data) + for pdf in black_pdfs: + data = [row for row in data if not any(pdf in cell for cell in row)] + return data + +if __name__ == '__main__': + + print(len('我是我')) + + # print(under_non_alpha_ratio('202水电费水电费水电费是的205月')) + # title = '母公司财务报表主要项目注释' + # if len(re.findall('母公司|现金流量表补充', title)) >0 and len(re.findall('项目注释', title)) == 0: + # print('1') + # else: + # print('0') + + # print(check_black_list('当期投资活动现金流净额','当前筹资活动现金流净额')) + # test = '2023年1-12月' + # print(get_period_type('上年度本期费用化研发投入')) + # print(get_period_type('费用化研发投入本年度')) + # vector_a = embed_with_str('第一季度营业收入') + # vector = vector_a.output["embeddings"][0]["embedding"] + + # vector_b = embed_with_str('营业收入第一季度') + # vector1 = vector_b.output["embeddings"][0]["embedding"] + + # similarity = cosine_similarity(vector, vector1) + # print(f"余弦相似度: {similarity}") + + # measure_data = [ + # '1,1,营业收入2023年金额,1003535799.51', + # '1,1,营业收入2022年金额,869401513.71', + # '1,1,营业收入变动比例,15.43%', + # '1,1,营业成本2023年金额,810779075.89', + # '1,1,营业成本2023年占营业收入的比重,80.79%', + # '1,1,营业成本2022年金额,702990363.57', + # '1,1,营业成本2022年占营业收入的比重,80.86%', + # '1,1,营业成本变动比例,15.33%', + # '1,1,毛利率2023年金额,19.21%', + # '1,1,毛利率2022年金额,19.14%', + # '1,1,销售费用2023年金额,34065464.60', + # '1,1,销售费用2023年占营业收入的比重,3.39%', + # '1,1,销售费用2022年金额,28038106.19', + # '1,1,销售费用2022年占营业收入的比重,3.22%', + # '1,1,销售费用变动比例,21.50%', + # '1,1,管理费用2023年金额,50807308.69', + # '1,1,管理费用2023年占营业收入的比重,5.06%', + # '1,1,管理费用2022年金额,38251704.48', + # '1,1,管理费用2022年占营业收入的比重,4.40%', + # '1,1,管理费用变动比例,32.82%', + # '1,1,研发费用2023年金额,35312198.23', + # '1,1,研发费用2023年占营业收入的比重,3.52%', + # '1,1,研发费用2022年金额,30081787.99', + # '1,1,研发费用2022年占营业收入的比重,3.46%', + # '1,1,研发费用变动比例,17.39%', + # '1,1,财务费用2023年金额,8015604.52', + # '1,1,财务费用2023年占营业收入的比重,0.80%', + # '1,1,财务费用2022年金额,5739677.85', + # '1,1,财务费用2022年占营业收入的比重,0.66%', + # '1,1,财务费用变动比例,39.65%', + # '1,1,信用减值损失2023年金额,-11873626.82', + # '1,1,信用减值损失2023年占营业收入的比重,-1.18%', + # 
'1,1,信用减值损失2022年金额,-8903293.61', + # '1,1,信用减值损失2022年占营业收入的比重,-1.02%', + # '1,1,信用减值损失变动比例,33.36%', + # '1,1,资产减值损失2023年金额,-2328729.46', + # '1,1,资产减值损失2023年占营业收入的比重,-0.23%', + # '1,1,资产减值损失2022年金额,-2285987.53', + # '1,1,资产减值损失2022年占营业收入的比重,-0.26%', + # '1,1,资产减值损失变动比例,1.87%', + # '1,1,其他收益2023年金额,17886048.88', + # '1,1,其他收益2023年占营业收入的比重,1.78%', + # '1,1,其他收益2022年金额,11025908.32', + # '1,1,其他收益2022年占营业收入的比重,1.27%', + # '1,1,其他收益变动比例,62.22%', + # '1,1,投资收益2023年金额,323361.47', + # '1,1,投资收益2023年占营业收入的比重,0.03%', + # '1,1,投资收益2022年金额,1119730.43', + # '1,1,投资收益2022年占营业收入的比重,0.13%', + # '1,1,投资收益变动比例,-71.12%', + # '1,1,公允价值变动收益2023年占营业收入的比重,0.00%', + # '1,1,公允价值变动收益2022年金额,10183.62', + # '1,1,公允价值变动收益2022年占营业收入的比重,0.00%', + # '1,1,公允价值变动收益变动比例,-100.00%', + # '1,1,资产处置收益2023年金额,12782544.48', + # '1,1,资产处置收益2023年占营业收入的比重,1.27%', + # '1,1,资产处置收益2022年金额,-59.56', + # '1,1,资产处置收益2022年占营业收入的比重,0.00%', + # '1,1,资产处置收益变动比例,21461726.06%', + # '1,1,汇兑收益2023年金额,0', + # '1,1,汇兑收益2023年占营业收入的比重,0%', + # '1,1,汇兑收益2022年金额,0', + # '1,1,汇兑收益2022年占营业收入的比重,0%', + # '1,1,汇兑收益变动比例,0%', + # '1,1,营业利润2023年金额,76175407.00', + # '1,1,营业利润2023年占营业收入的比重,7.59%', + # '1,1,营业利润2022年金额,63332601.81', + # '1,1,营业利润2022年占营业收入的比重,7.28%', + # '1,1,营业利润变动比例,20.28%', + # '1,1,营业外收入2023年金额,5788307.99', + # '1,1,营业外收入2023年占营业收入的比重,0.58%', + # '1,1,营业外收入2022年金额,1083997.19', + # '1,1,营业外收入2022年占营业收入的比重,0.12%', + # '1,1,营业外收入变动比例,433.98%', + # '1,1,营业外支出2023年金额,687271.68', + # '1,1,营业外支出2023年占营业收入的比重,0.07%', + # '1,1,营业外支出2022年金额,1554243.54', + # '1,1,营业外支出2022年占营业收入的比重,0.18%', + # '1,1,营业外支出变动比例,-55.78%', + # '1,1,净利润2023年金额,72975283.09', + # '1,1,净利润2023年占营业收入的比重,7.27%', + # '1,1,净利润2022年金额,57747603.98', + # '1,1,净利润2022年占营业收入的比重,6.64%', + # '1,1,净利润变动比例,26.37%', + # '1,1,税金及附加2023年金额,5170339.13', + # '1,1,税金及附加2023年占营业收入的比重,0.52%', + # '1,1,税金及附加2022年金额,1933753.49', + # '1,1,税金及附加2022年占营业收入的比重,0.22%', + # '1,1,税金及附加变动比例,167.37%', + # '1,1,所得税费用2023年金额,8301160.22', + # '1,1,所得税费用2023年占营业收入的比重,0.83%', + # '1,1,所得税费用2022年金额,5114751.48', + # '1,1,所得税费用2022年占营业收入的比重,0.59%', + # '1,1,所得税费用变动比例,62.30%', + # '1,1,少数股东损益2023年金额,-58350.22', + # '1,1,少数股东损益2023年占营业收入的比重,-0.01%', + # '1,1,少数股东损益2022年金额,-946.60', + # '1,1,少数股东损益2022年占营业收入的比重,0.00%', + # '1,1,少数股东损益变动比例,-6064.19%', + # '1,1,归属于母公司所有者的净利润2023年金额,73033633.31', + # '1,1,归属于母公司所有者的净利润2023年占营业收入的比重,7.28%', + # '1,1,归属于母公司所有者的净利润2022年金额,57748550.58', + # '1,1,归属于母公司所有者的净利润2022年占营业收入的比重,6.64%', + # '1,1,归属于母公司所有者的净利润变动比例,26.47%', + # '1,1,归属于少数股东的综合收益总额2023年金额,-58350.22', + # '1,1,归属于少数股东的综合收益总额2023年占营业收入的比重,-0.01%', + # '1,1,归属于少数股东的综合收益总额2022年金额,-946.60', + # '1,1,归属于少数股东的综合收益总额2022年占营业收入的比重,0.00%', + # '1,1,归属于少数股东的综合收益总额变动比例,-6064.19%', + # '1,1,归属于母公司所有者的综合收益总额2023年金额,73033633.31', + # '1,1,归属于母公司所有者的综合收益总额2023年占营业收入的比重,7.28%', + # '1,1,归属于母公司所有者的综合收益总额2022年金额,57748550.58', + # '1,1,归属于母公司所有者的综合收益总额2022年占营业收入的比重,6.64%', + # '1,1,归属于母公司所有者的综合收益总额变动比例,26.47%', + # '2,1,主营业务收入2023年,983698831.48', + # '2,1,主营业务收入2022年,854682261.31', + # '2,1,主营业务收入变动比例,15.10%', + # '2,1,其他业务收入2023年,19836968.03', + # '2,1,其他业务收入2022年,14719252.40', + # '2,1,其他业务收入变动比例,34.77%', + # '2,1,主营业务成本2023年,793604607.43', + # '2,1,主营业务成本2022年,690932741.27', + # '2,1,主营业务成本变动比例,14.86%', + # '2,1,其他业务成本2023年,17174468.46', + # '2,1,其他业务成本2022年,12057622.30', + # '2,1,其他业务成本变动比例,42.44%', + # '3,1,变压器营业收入,490028234.05', + # '3,1,变压器营业成本,402179824.08', + # '3,1,变压器毛利率,17.93%', + # '3,1,变压器营业收入比上年同期增减,16.22%', + # '3,1,变压器营业成本比上年同期增减,16.33%', + # '3,1,变压器毛利率比上年同期增减,减少0.07个百分点', + # '3,1,高低压成套开关设备营业收入,261342442.26', + # 
'3,1,高低压成套开关设备营业成本,206645237.99', + # '3,1,高低压成套开关设备毛利率,20.93%', + # '3,1,高低压成套开关设备营业收入比上年同期增减,-8.93%', + # '3,1,高低压成套开关设备营业成本比上年同期增减,-9.91%', + # '3,1,高低压成套开关设备毛利率比上年同期增减,增加0.86个百分点', + # '3,1,户外成套设备营业收入,198013248.27', + # '3,1,户外成套设备营业成本,157856817.84', + # '3,1,户外成套设备毛利率,20.28%', + # '3,1,户外成套设备营业收入比上年同期增减,62.25%', + # '3,1,户外成套设备营业成本比上年同期增减,65.30%', + # '3,1,户外成套设备毛利率比上年同期增减,减少1.47个百分点', + # '3,1,其他营业收入,54151874.93', + # '3,1,其他营业成本,44097195.98', + # '3,1,其他毛利率,18.57%', + # '3,1,其他营业收入比上年同期增减,39.68%', + # '3,1,其他营业成本比上年同期增减,36.10%', + # '3,1,其他毛利率比上年同期增减,增加2.14个百分点', + # '3,1,合计营业收入,1003535799.51', + # '3,1,合计营业成本,810779075.89', + # '3,2,东北地区营业收入,2425280.53', + # '3,2,东北地区营业成本,1427939.37', + # '3,2,东北地区毛利率,41.12%', + # '3,2,东北地区营业收入比上年同期增减,-69.51%', + # '3,2,东北地区营业成本比上年同期增减,-77.58%', + # '3,2,东北地区毛利率比上年同期增减,增加21.20个百分点', + # '3,2,华北地区营业收入,70542020.62', + # '3,2,华北地区营业成本,53044055.18', + # '3,2,华北地区毛利率,24.81%', + # '3,2,华北地区营业收入比上年同期增减,205.32%', + # '3,2,华北地区营业成本比上年同期增减,203.18%', + # '3,2,华北地区毛利率比上年同期增减,增加0.54个百分点', + # '3,2,华东地区营业收入,770352353.33', + # '3,2,华东地区营业成本,636803535.34', + # '3,2,华东地区毛利率,17.34%', + # '3,2,华东地区营业收入比上年同期增减,24.17%', + # '3,2,华东地区营业成本比上年同期增减,25.30%', + # '3,2,华东地区毛利率比上年同期增减,减少0.74个百分点', + # '3,2,华南地区营业收入,18509519.71', + # '3,2,华南地区营业成本,14496855.46', + # '3,2,华南地区毛利率,21.68%', + # '3,2,华南地区营业收入比上年同期增减,-57.08%', + # '3,2,华南地区营业成本比上年同期增减,-57.98%', + # '3,2,华南地区毛利率比上年同期增减,增加1.67个百分点', + # '3,2,华中地区营业收入,60588394.64', + # '3,2,华中地区营业成本,44559969.21', + # '3,2,华中地区毛利率,26.45%', + # '3,2,华中地区营业收入比上年同期增减,-51.24%', + # '3,2,华中地区营业成本比上年同期增减,-55.13%', + # '3,2,华中地区毛利率比上年同期增减,增加6.38个百分点', + # '3,2,西北地区营业收入,58618014.32', + # '3,2,西北地区营业成本,42844719.81', + # '3,2,西北地区毛利率,26.91%', + # '3,2,西北地区营业收入比上年同期增减,178.59%', + # '3,2,西北地区营业成本比上年同期增减,173.62%', + # '3,2,西北地区毛利率比上年同期增减,增加1.33个百分点', + # '3,2,西南地区营业收入,22500216.36', + # '3,2,西南地区营业成本,17602001.52', + # '3,2,西南地区毛利率,21.77%', + # '3,2,西南地区营业收入比上年同期增减,-23.74%', + # '3,2,西南地区营业成本比上年同期增减,-17.89%', + # '3,2,西南地区毛利率比上年同期增减,减少5.57个百分点', + # '3,2,合计营业收入,1003535799.51', + # '3,2,合计营业成本,810779075.89', + # '5,2,经营活动产生的现金流量净额2023年,-44713443.44', + # '5,2,经营活动产生的现金流量净额2022年,-53241071.45', + # '5,2,经营活动产生的现金流量净额变动比例,16.02%', + # '5,2,投资活动产生的现金流量净额2023年,-88649920.50', + # '5,2,投资活动产生的现金流量净额2022年,-94251741.15', + # '5,2,投资活动产生的现金流量净额变动比例,5.94%', + # '5,2,筹资活动产生的现金流量净额2023年,96607197.26', + # '5,2,筹资活动产生的现金流量净额2022年,210537586.22', + # '5,2,筹资活动产生的现金流量净额变动比例,-54.11%' + # ] + + # client = MilvusClient( + # uri="http://localhost:19530" + # ) + # vector_obj = embed_with_str('2023年营业收入') + # vector = vector_obj.output["embeddings"][0]["embedding"] + # data = [vector] + # res = client.search( + # collection_name="zzb_measure", # Replace with the actual name of your collection + # # Replace with your query vector + # data=data, + # limit=1, # Max. 
number of search results to return
+    #     search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
+    #     output_fields=["measure_name","measure_value"]
+    # )
+
+    # # Convert the output to a formatted JSON string
+    # result = json.dumps(res, indent=4, ensure_ascii=False)
+    # print(result)
+
+    # insert_measure_data(client, measure_data)
+    # text = '营业收入第一季度(1-3月份)'
+    # new_text = re.sub(r'([^)]*)', '',text)
+    # print(new_text)
diff --git a/zzb_data_word/word_title.py b/zzb_data_word/word_title.py
new file mode 100644
index 0000000..63c15e5
--- /dev/null
+++ b/zzb_data_word/word_title.py
@@ -0,0 +1,16 @@
+
+import re
+def get_parent_table_pages(title_array, file_id):
+    parent_table_pages_local = {}
+    parent_table_pages_local[file_id] = []
+    print(f'{file_id}:{len(title_array)}')
+    for i in range(len(title_array)):
+        title_obj = title_array[i]
+        title = title_obj['data']
+        if len(re.findall('母公司|现金流量表补充|重要会计政策|会计估计变更|公允价值的披露|合营安排或联营企业中的权益|与金融工具相关的风险|税项|主要控股参股公司|结构化主体情况|公司股份总数及股东结构变动及公司资产和负债结构的变动情况|所有权或使用权受到限制的资产|在建工程|固定资产|其他主体中的权益|分部信息|与金融工具相关的风险|其他关联交易|公司子公司重大事项', title)) >0 :
+            page_num = title_obj['index']
+            parent_table_pages_local[file_id].append(page_num)
+
+    parent_table_pages = parent_table_pages_local[file_id]
+    return parent_table_pages
+
diff --git a/zzb_data_word/zzb_logger.py b/zzb_data_word/zzb_logger.py
new file mode 100644
index 0000000..f5c0777
--- /dev/null
+++ b/zzb_data_word/zzb_logger.py
@@ -0,0 +1,39 @@
+import time
+import logging
+import logging.handlers
+import os
+
+# create the log folder if it does not exist yet
+log_dir = "log-day"  # name of the folder that holds the rotated logs
+log_path = os.getcwd() + os.sep + log_dir
+if not os.path.isdir(log_path):
+    os.makedirs(log_path)
+
+# logging bootstrap
+logging.basicConfig()
+
+# application logger setup
+applog = logging.getLogger(__name__)
+applog.setLevel(logging.INFO)
+
+console_handler = logging.StreamHandler()
+console_handler.setLevel(logging.INFO)
+
+# TimedRotatingFileHandler:
+# rotate to a new log file once a day
+# and keep 3 old files around
+timefilehandler = logging.handlers.TimedRotatingFileHandler(
+    log_dir + os.sep + "sec.log",
+    when='D',
+    interval=1,
+    backupCount=3
+)
+# the suffix uses strftime-style formatting
+timefilehandler.suffix = "%Y-%m-%d_%H-%M-%S.log"
+# timefilehandler.suffix = "%Y-%m-%d.log"
+
+formatter = logging.Formatter('%(asctime)s|%(name)-12s: %(levelname)-8s %(message)s')
+console_handler.setFormatter(formatter)
+timefilehandler.setFormatter(formatter)
+applog.addHandler(timefilehandler)
+applog.addHandler(console_handler)
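
For reference, a minimal driver showing how the pieces added in this commit compose end to end. This is a sketch, not part of the commit: the document name and file_id below are placeholders, and it assumes the MySQL/Milvus/Redis endpoints in config.py are reachable and that report_check / table_title_list rows exist for the id, since every step writes through them.

    import json
    from parse_word import parse_docx, split_text_table
    from word_title import get_parent_table_pages
    import main_word

    file_id = "demo-1"  # placeholder; real ids come from the report_check table
    doc_json, catalog_json = parse_docx("101.docx")  # sample document from parse_word's __main__
    text_json, table_json = split_text_table(json.loads(doc_json))
    texts, tables = json.loads(text_json), json.loads(table_json)

    # element indexes of chapters whose tables should be ignored (母公司, 现金流量表补充, ...)
    parent_pages = get_parent_table_pages(json.loads(catalog_json), file_id)

    main_word.process_table(file_id, tables)                          # raw tables -> word_parse_data / word_parse_process
    main_word.process_text_content(file_id, texts, tables, texts)     # text lines, table units, title black lists
    main_word.start_table_measure_job(file_id)                        # fan out get_table_measure worker processes
    main_word.update_measure_data(file_id, "101.docx", parent_pages)  # vector matching + measure normalization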