Initial commit with all project files including zzbdataprod

zdzerg 2025-09-11 10:13:03 +08:00
commit 68069cd92b
64 changed files with 16502 additions and 0 deletions

6
.gitignore vendored Normal file

@@ -0,0 +1,6 @@
logs/
*.log
pdf/
zzb_data_prod/nohup.out
zzb_data_prod/logs/
zzb_data_prod/app.log

33
milvus_init.py Normal file

@@ -0,0 +1,33 @@
from pymilvus import connections, CollectionSchema, Collection, utility, FieldSchema, DataType

# Connect to the Milvus instance
# connections.connect(host='124.70.129.232', port='19530')  # test server
connections.connect(host='127.0.0.1', port='19530')  # local instance

# Drop the collection if it already exists
utility.drop_collection("pdf_measure_v4")

# Define the fields
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=1536),
    FieldSchema(name="table_num", dtype=DataType.INT16),
    FieldSchema(name="table_index", dtype=DataType.INT16),
    FieldSchema(name="measure_name", dtype=DataType.VARCHAR, max_length=200),
    FieldSchema(name="measure_value", dtype=DataType.VARCHAR, max_length=200),
    FieldSchema(name="file_id", dtype=DataType.VARCHAR, max_length=200),
    FieldSchema(name="measure_unit", dtype=DataType.VARCHAR, max_length=200)
]
# Define the collection schema
schema = CollectionSchema(fields=fields, description="My Milvus collection")
# Create the collection
collection = Collection(name="pdf_measure_v4", schema=schema)

# Build an IVF_FLAT index with cosine similarity on the vector field
index_params = {
    "index_type": "IVF_FLAT",
    "metric_type": "COSINE",
    "params": {"nlist": 128}
}
collection.create_index(field_name="vector", index_params=index_params)
collection.load()
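
For reference, a minimal sketch of how the collection created above could be searched once vectors have been inserted. The query embedding is a placeholder (any 1536-dimensional vector from the same embedding model would do), and nprobe is an illustrative tuning value, not something fixed by this repository:

from pymilvus import connections, Collection

connections.connect(host='127.0.0.1', port='19530')
collection = Collection("pdf_measure_v4")
collection.load()

# query_vector stands in for a real 1536-dim embedding of a measure name
query_vector = [0.0] * 1536
results = collection.search(
    data=[query_vector],
    anns_field="vector",
    param={"metric_type": "COSINE", "params": {"nprobe": 16}},
    limit=5,
    output_fields=["measure_name", "measure_value", "measure_unit", "file_id"],
)
for hits in results:
    for hit in hits:
        print(hit.distance, hit.entity.get("measure_name"))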

125
monitor_milvus.py Normal file

@@ -0,0 +1,125 @@
import socket
import subprocess
import time
from datetime import datetime
import os
import mysql.connector
from zzb_data_prod.config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB

def get_time():
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

def check_port(host, port):
    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(5)
        result = sock.connect_ex((host, port))
        sock.close()
        return result == 0  # True if the port is reachable
    except Exception as e:
        print(f"[{get_time()}] Port check failed: {str(e)}")
        return False

def restart_service():
    try:
        subprocess.run(
            ["bash", "/root/docker/milvus/standalone_embed.sh", "restart"],
            check=True,  # raise CalledProcessError on a non-zero exit code
        )
        print(f"[{get_time()}] Milvus service restarted")
        return True
    except subprocess.CalledProcessError as e:
        print(f"[{get_time()}] Service restart failed: {str(e)}")
        return False

def start_application_process():
    """Start the application process that serves port 8000."""
    try:
        # Give any old process time to stop first
        time.sleep(2)
        subprocess.run(
            ["bash", "/root/pdf_parser/restart_app.sh"],
            check=True,
        )
        print(f"[{get_time()}] Application process (port 8000) started")
        return True
    except Exception as e:
        print(f"[{get_time()}] Failed to start application process: {str(e)}")
        return False

def get_local_ip():
    s = None
    try:
        # Open a UDP socket and "connect" to an external address
        # (Google's public DNS); no traffic is actually sent.
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect(("8.8.8.8", 80))
        # Read the local address of the socket
        local_ip = s.getsockname()[0]
    except Exception as e:
        print(f"[{get_time()}] Failed to determine LAN IP: {e}")
        local_ip = "127.0.0.1"  # fall back to loopback
    finally:
        if s is not None:
            s.close()
    return local_ip

def monitor_port_8000():
    """Check port 8000 and start the application process if it is down."""
    print(f"[{get_time()}] Checking port 8000...")
    port_available = check_port("127.0.0.1", 8000)
    if not port_available:
        print(f"[{get_time()}] Port 8000 is down; trying to start the application...")
        success = start_application_process()
        if success:
            time.sleep(10)  # wait for the application to come up
            if check_port("127.0.0.1", 8000):
                print(f"[{get_time()}] Application started; port 8000 is back up")
                # Mark this node as available again in the database
                conn = None
                cursor = None
                try:
                    conn = mysql.connector.connect(
                        host=MYSQL_HOST,
                        user=MYSQL_USER,
                        password=MYSQL_PASSWORD,
                        database=MYSQL_DB
                    )
                    cursor = conn.cursor()
                    local_ip = get_local_ip()
                    # Parameterized query instead of string interpolation
                    sql = "UPDATE model_ip SET status = 0 WHERE ip = %s"
                    print(f"[{get_time()}] Executing SQL: {sql}")
                    cursor.execute(sql, (f"{local_ip}:8000",))
                    conn.commit()
                    print(f"[{get_time()}] Database record updated")
                except Exception as e:
                    print(f"[{get_time()}] Failed to update database record: {str(e)}")
                finally:
                    if cursor is not None:
                        cursor.close()
                    if conn is not None:
                        conn.close()
            else:
                print(f"[{get_time()}] Port 8000 is still down after starting the application")
    else:
        print(f"[{get_time()}] Port 8000 is up")

if __name__ == '__main__':
    print(f"[{get_time()}] Starting Milvus monitor")
    port_ok = check_port("127.0.0.1", 19530)
    if not port_ok:
        print("Milvus appears to be down; attempting restart...")
        restart_service()
    print(f"[{get_time()}] Starting port 8000 monitor")
    # Single check per run; schedule this script to run every 60 seconds
    monitor_port_8000()
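
The script performs one check per run; the 60-second cadence mentioned in the comment would come from an external scheduler. A possible crontab entry (the interpreter and log paths are assumptions; cron's minimum granularity is one minute):

* * * * * /usr/bin/python3 /root/pdf_parser/monitor_milvus.py >> /root/pdf_parser/logs/monitor.log 2>&1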

76
restart_app.sh Executable file

@@ -0,0 +1,76 @@
#!/bin/bash

# Switch to the Milvus docker directory and restart Milvus
cd /root/docker/milvus || { echo "Cannot enter directory /root/docker/milvus"; exit 1; }
bash standalone_embed.sh restart

# PDF cleanup: target directory (adjust to the actual path)
TARGET_DIR="/root/pdf_parser/pdf"
LOG_FILE="/root/pdf_parser/logs/pdf_clean.log"

# Create the log directory
mkdir -p "$(dirname "$LOG_FILE")"

# Timestamped log helper
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

# Make sure the target directory exists
if [ ! -d "$TARGET_DIR" ]; then
    log "Error: target directory does not exist: $TARGET_DIR"
    exit 1
fi

# Count the files before deleting them so the summary is accurate
log "Cleaning up PDF files..."
pdf_count=$(find "$TARGET_DIR" -iname "*.pdf" | wc -l)
find "$TARGET_DIR" -iname "*.pdf" -print0 | while IFS= read -r -d $'\0' file; do
    log "Deleting file: $file"
    rm -f "$file"
done
log "Cleanup complete; deleted $pdf_count leftover file(s)"

# Application working directory and log path
WORK_DIR="/root/pdf_parser/zzb_data_prod"
LOG_FILE="$WORK_DIR/app.log"

# Stop any existing app.py process
pids=$(ps -ef | grep app.py | grep -v grep | awk '{print $2}')
if [ -n "$pids" ]; then
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] Stopping existing process(es): $pids"
    kill -9 $pids
else
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] No running process found"
fi

# Enter the working directory
cd "$WORK_DIR" || { echo "Cannot enter directory $WORK_DIR"; exit 1; }

# Start the service
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Starting service..."
nohup python3 app.py > "$LOG_FILE" 2>&1 &

# Wait for the process to start
sleep 2

# Check the process status
new_pid=$(ps -ef | grep app.py | grep -v grep | awk '{print $2}')
if [ -n "$new_pid" ]; then
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] Service started; PID: $new_pid"
    echo "--------------------------------"
    tail -n 10 "$LOG_FILE"
else
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] Service failed to start!"
    echo "--------------------------------"
    cat "$LOG_FILE"
    exit 1
fi

BIN
zzb_data_prod/.DS_Store vendored Normal file

Binary file not shown.

674
zzb_data_prod/LICENSE Normal file

@@ -0,0 +1,674 @@
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU General Public License is a free, copyleft license for
software and other kinds of works.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.
Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.
For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Use with the GNU Affero General Public License.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
<program> Copyright (C) <year> <name of author>
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<http://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
<http://www.gnu.org/philosophy/why-not-lgpl.html>.

73
zzb_data_prod/Mil_unit.py Normal file

@@ -0,0 +1,73 @@
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, MilvusClient
from config import MILVUS_CLIENT
import time
from datetime import datetime, timedelta
from log_config import logger

def create_partition_by_hour(file_id):
    """Create a partition for file_id and drop every other non-default partition."""
    # Connect to the Milvus server
    connections.connect(uri=MILVUS_CLIENT)
    # Get the collection
    collection_name = "pdf_measure_v4"
    collection = Collection(collection_name)
    # Create a partition for the current file id
    partition_name = f"partition_{file_id}"
    if not collection.has_partition(partition_name):
        collection.create_partition(partition_name)
        logger.info(f"Created partition: {partition_name}")
    partition = collection.partition(partition_name)
    partition.load()
    # List all partitions
    partitions = collection.partitions
    # Drop every partition except the default one and the current one
    for part in partitions:
        name = part.name
        if name not in ["_default", partition_name]:  # keep the default partition
            pre_partition = collection.partition(name)
            pre_partition.release()
            collection.drop_partition(name)
            logger.info(f"Partition '{name}' deleted.")
    connections.disconnect("default")
# from pymilvus import connections, CollectionSchema, Collection, utility, FieldSchema, DataType
# # Connect to Milvus on server B
# # connections.connect(host='124.70.129.232', port='19530')  # test server
# connections.connect(host='1.94.60.103', port='19530')  # test server
# # # List the collections
# utility.drop_collection("pdf_measure_v4")
#
# # Define the fields
# fields = [
#     FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
#     FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=1536),
#     FieldSchema(name="table_num", dtype=DataType.INT16),
#     FieldSchema(name="table_index", dtype=DataType.INT16),
#     FieldSchema(name="measure_name", dtype=DataType.VARCHAR, max_length=200),
#     FieldSchema(name="measure_value", dtype=DataType.VARCHAR, max_length=200),
#     FieldSchema(name="file_id", dtype=DataType.VARCHAR, max_length=200),
#     FieldSchema(name="measure_unit", dtype=DataType.VARCHAR, max_length=200)
# ]
#
# # Define the collection schema
# schema = CollectionSchema(fields=fields, description="My Milvus collection")
#
# # Create the collection
# collection = Collection(name="pdf_measure_v4", schema=schema)
#
# collection = Collection("pdf_measure_v4")
# index_params = {
#     "index_type": "IVF_FLAT",
#     "metric_type": "COSINE",
#     "params": {"nlist": 128}
# }
# collection.create_index(field_name="vector", index_params=index_params)
# collection.load()
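
A minimal usage sketch for the function above (the file id is a made-up example; despite the name, the function partitions by file id, not by hour):

from Mil_unit import create_partition_by_hour

# Creates partition_demo123 in pdf_measure_v4 and drops all other
# non-default partitions, as implemented above.
create_partition_by_hour("demo123")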

Binary files not shown.

254
zzb_data_prod/app.py Normal file

@@ -0,0 +1,254 @@
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import os
import socket
import utils
import queue
from multiprocessing import Process, Manager
import pdf_title
import main
import time
import threading
import config
import requests
import db_service
from Mil_unit import create_partition_by_hour
from datetime import datetime, timedelta
from log_config import logger

app = FastAPI()
cpu_count = 4
job_queue = queue.Queue()
# Request body model
class FileItem(BaseModel):
    file_path: str
    file_id: str

def run_job():
    # Only run if there is a queued job
    if_run = True
    if job_queue.empty():
        logger.info("job_queue is empty; nothing to run")
        if_run = False
    if if_run:
        job_config = job_queue.get()
        page_list = []
        file_path = job_config['file_path']
        file_id = job_config['file_id']
        job_status = True
        continue_execution = True
        try:
            # Download the PDF if a URL was given
            start_time = time.time()
            logger.info(f"Starting file parse job: {file_path}")
            if file_path.startswith('http'):
                file_path = utils.save_pdf_from_url(file_path, config.FILE_PATH)
            try:
                file_info = pdf_title.create_text_outline(file_path, file_id)
            except Exception as e:
                response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id, 'status': 7})
                logger.info(f'Job status notification url: {file_id}: {response.url}')
                logger.info(f'Job status notification response: {file_id}: {response.text}')
                logger.info(f"{file_id} failed: {e}")
                continue_execution = False
            # Clear any previously stored data for this file
            db_service.delete_MYSQL_DB_APP(file_id)
            db_service.delete_MYSQL_DB(file_id)
            if continue_execution:
                parent_table_pages = file_info['parent_table_pages']
                page_num = file_info['page_count']
                # Use at most cpu_count worker processes
                if page_num < cpu_count:
                    p_count = page_num
                else:
                    p_count = cpu_count
                for i in range(p_count):
                    page_list.append({
                        'type': 'table',
                        'page_num': file_info['split_parts']['table_split_parts'][i],
                        'path': file_path,
                        'file_id': file_id,
                        'parent_table_pages': parent_table_pages,
                        'page_count': file_info['page_count'],
                        'tables_range': {},
                    })
                # Notify: parsing started
                response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id, 'status': 5})
                logger.info(f'PDF parse-start notification url: {file_id}: {response.url}')
                logger.info(f'PDF parse-start notification response: {file_id}: {response.text}')
                parser_start_time = time.time()
                processes = []
                time_dispatch_job = time.time()
                for job_info in page_list:
                    p = Process(target=main.dispatch_job, args=(job_info,))
                    processes.append(p)
                    p.start()
                logger.info(f'Waiting for all child jobs to finish, job ID: {file_id}')
                for p in processes:
                    p.join()
                logger.info(f'PDF parse job finished, job ID: {file_id}')
                time_dispatch_job_end = time.time()
                process_time = time_dispatch_job_end - time_dispatch_job
                db_service.process_time(file_id, '1', process_time, time_dispatch_job, time_dispatch_job_end)
                parser_end_time = time.time()
                logger.info(f"Parse job {file_id} done in {(parser_end_time - parser_start_time):.2f} s.")
                # Decide whether measure extraction should continue
                if db_service.file_type_check(file_id):
                    logger.info("Text check done; table generation already finished")
                else:
                    # Notify: measure extraction started
                    response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id, 'status': 6})
                    logger.info(f'Measure-extraction notification url: {file_id}: {response.url}')
                    logger.info(f'Measure-extraction notification response: {file_id}: {response.text}')
                    parser_start_time = time.time()
                    logger.info(f'Starting table measure extraction, job ID: {file_id}')
                    time_start = time.time()
                    partition_name = f"partition_{file_id}"
                    # Create a fresh Milvus partition for this file
                    create_partition_by_hour(file_id)
                    time.sleep(10)
                    # Check whether this is a Q3 (third-quarter) report
                    if db_service.file_type_check_v2(file_id) == 3:
                        main.start_table_measure_job(file_id, partition_name)
                        time_start_end = time.time()
                        process_time = time_start_end - time_start
                        db_service.process_time(file_id, '2', process_time, time_start, time_start_end)
                        logger.info(f'Table measure extraction finished, job ID: {file_id}')
                        parser_end_time = time.time()
                        logger.info(f"Table measure extraction {file_id} done in {(parser_end_time - parser_start_time):.2f} s.")
                        logger.info(f'Starting measure normalization, job ID: {file_id}')
                        time_update = time.time()
                        main.update_measure_data(file_id, file_path, parent_table_pages, partition_name)
                        logger.info(f'Normalization finished, job ID: {file_id}')
                        end_time = time.time()
                        logger.info(f"Job {file_id} done in {(end_time - start_time):.2f} s.")
                        time_update_end = time.time()
                        process_time = time_update_end - time_update
                        db_service.process_time(file_id, '3', process_time, time_update, time_update_end)
                    # Not a Q3 report: follow the annual / semi-annual path
                    # (currently identical to the branch above)
                    else:
                        main.start_table_measure_job(file_id, partition_name)
                        time_start_end = time.time()
                        process_time = time_start_end - time_start
                        db_service.process_time(file_id, '2', process_time, time_start, time_start_end)
                        logger.info(f'Table measure extraction finished, job ID: {file_id}')
                        parser_end_time = time.time()
                        logger.info(f"Table measure extraction {file_id} done in {(parser_end_time - parser_start_time):.2f} s.")
                        logger.info(f'Starting measure normalization, job ID: {file_id}')
                        time_update = time.time()
                        main.update_measure_data(file_id, file_path, parent_table_pages, partition_name)
                        logger.info(f'Normalization finished, job ID: {file_id}')
                        end_time = time.time()
                        logger.info(f"Job {file_id} done in {(end_time - start_time):.2f} s.")
                        time_update_end = time.time()
                        process_time = time_update_end - time_update
                        db_service.process_time(file_id, '3', process_time, time_update, time_update_end)
                # Notify: job finished
                response_time = time.time()
                response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id, 'status': 1})
                logger.info(f'Job status notification url: {file_id}: {response.url}')
                logger.info(f'Job status notification response: {file_id}: {response.text}')
                response_time_end = time.time()
                process_time = response_time_end - response_time
                db_service.process_time(file_id, '4', process_time, response_time, response_time_end)
        except Exception as e:
            # Notify: job failed (status 4 is sent for all failures,
            # including division-by-zero errors)
            response_time = time.time()
            response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id, 'status': 4})
            response_time_end = time.time()
            process_time = response_time_end - response_time
            db_service.process_time(file_id, '4', process_time, response_time, response_time_end)
            logger.info(f'Job status notification url: {file_id}: {response.url}')
            logger.info(f'Job status notification response: {file_id}: {response.text}')
            logger.info(f"Response status code: {response.status_code}")
            logger.info(f"{file_id} failed: {e}")
        finally:
            logger.info(f"Job {file_id} finished, run status: {job_status}")
            # pdf_company_0824.name_code_fix(file_id, file_path)
            # print('Company name and code filled in')
    else:
        logger.info("A job is already running; waiting.....")
def parse_pdf_route(fileItem: FileItem):
    # Queue the job so only one file is parsed at a time
    job_queue.put({
        'file_path': fileItem.file_path,
        'file_id': fileItem.file_id
    })
    logger.info(f"Added {fileItem.file_id} to the queue.")
    threading.Thread(target=run_job, args=()).start()
    return {"success": True, "msg": "File parsing started"}

# Register the route; equivalent to decorating parse_pdf_route with @app.post(...)
app.post("/parser/start",
         tags=["parser"],
         summary="Parse a PDF file",
         )(parse_pdf_route)
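
A minimal sketch of calling the endpoint registered above from a client (the host, file path, and file id are made-up examples; the port follows config.PORT):

import requests

resp = requests.post(
    "http://127.0.0.1:8000/parser/start",
    json={"file_path": "http://example.com/report.pdf", "file_id": "demo123"},
)
print(resp.status_code, resp.json())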
def get_local_ip():
    s = None
    try:
        # Open a UDP socket and "connect" to an external address
        # (Google's public DNS); no traffic is actually sent.
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect(("8.8.8.8", 80))
        # Read the local address of the socket
        local_ip = s.getsockname()[0]
    except Exception as e:
        logger.info(f"Failed to determine LAN IP: {e}")
        local_ip = "127.0.0.1"  # fall back to loopback
    finally:
        if s is not None:
            s.close()
    return local_ip
# Run the FastAPI application
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=config.PORT)
    # Everything below runs only after the server exits
    try:
        # Report this node's LAN address to the coordinator
        ip = get_local_ip()
        logger.info(f"LAN IP address: {ip}")
        # config.NOTIFY_ADDR is a URL string; keep everything up to the
        # last '/' and append the restart endpoint
        url = config.NOTIFY_ADDR.rpartition('/')[0] + '/restart?address'
        address = f"{ip}:{config.PORT}"
        logger.info(address)
        response = requests.get(url, params={'address': address})
        logger.info(f"Response status code: {response.status_code}")
    except KeyboardInterrupt:
        logger.info("Shutdown server")
    # Local debugging:
    # job_queue.put({
    #     'file_path': '1.pdf',
    #     'file_id': '2222222'
    # })
    # run_job()

Binary file not shown.

24
zzb_data_prod/config.py Normal file

@@ -0,0 +1,24 @@
MILVUS_CLIENT='http://127.0.0.1:19530'
MILVUS_HOST = '127.0.0.1'
MILVUS_PORT = 19530
MYSQL_HOST = '10.127.2.207'
MYSQL_PORT = 3306
MYSQL_USER = 'financial_prod'
MYSQL_PASSWORD = 'mmTFncqmDal5HLRGY0BV'
MYSQL_DB = 'financial_report_prod'
NOTIFY_ADDR = 'http://10.127.2.202:8100/api/tenant/report/notify'
FILE_PATH = '/root/pdf_parser/pdf/'
REDIS_HOST = '10.127.2.209'
REDIS_PORT = 6379
REDIS_PASSWORD = 'dMrt4kmwiW6LDJXy'
PORT = 8000
MEASURE_COUNT = 4
MYSQL_HOST_APP = '10.127.2.208'
MYSQL_PORT_APP = 3306
MYSQL_USER_APP = 'financial_report'
MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
MYSQL_DB_APP = 'financial_report_prod'
# api_key = 'sk-2c695c8bdc5c4bb5b48feffa5d9e1de7'
api_key = 'sk-f8413fcfa63a40e49dbb8ac1d0239f6d'

1110
zzb_data_prod/db_service.py Normal file

File diff suppressed because it is too large


@@ -0,0 +1,84 @@
# Error reporting
import paramiko
import time
import threading

# Run the cleanup commands on one server
def execute_commands_on_server(hostname, username, password, host):
    # host is a human-readable label for the server (currently unused)
    try:
        # Connect to the server
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(hostname=hostname, username=username, password=password)
        # Run the commands in an interactive shell
        shell = client.invoke_shell()
        # Remove leftover PDF files
        shell.send("cd /root/pdf_parser/pdf\n")
        time.sleep(1)
        shell.send("rm -f *.pdf\n")
        time.sleep(10)
        shell.send("rm -f *.PDF\n")
        time.sleep(10)
        # Read the output
        output = shell.recv(2048).decode()
        print(f"Output from {hostname}:\n{output}")
    except paramiko.SSHException as e:
        print(f"SSH connection error with {hostname}: {e}")
    finally:
        client.close()

# Thread entry point
def thread_function(server):
    execute_commands_on_server(server['hostname'], server['username'], server['password'], server['host'])

# Server list
# servers = [
#     {'hostname': 'server1.example.com', 'username': 'user1', 'password': 'pass1', 'host': 'host1'},
#     {'hostname': 'server2.example.com', 'username': 'user2', 'password': 'pass2', 'host': 'host2'},
#     # add more servers
# ]
servers = [
    # {'hostname': '124.70.129.232', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'test server'},
    # {'hostname': '1.94.179.121', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'production server'},  # deprecated
    # Original 10 servers
    {'hostname': '113.44.72.157', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'production server 1'},
    {'hostname': '1.94.101.237', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'production server 2'},
    {'hostname': '123.60.16.225', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'production server 3'},
    {'hostname': '124.71.157.162', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'production server 4'},
    {'hostname': '1.94.60.103', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'production server 5'},
    {'hostname': '1.94.143.23', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'production server 6'},  # everything gets stored here
    {'hostname': '124.71.149.225', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'production server 7'},
    {'hostname': '113.44.52.221', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'production server 8'},
    {'hostname': '121.37.137.13', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'production server 9'},
    {'hostname': '123.60.28.83', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'production server 10'},
    # New 10 servers
    {'hostname': '192.168.0.19', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'new production server 1'},
    {'hostname': '192.168.0.53', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'new production server 2'},
    {'hostname': '192.168.0.150', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'new production server 3'},
    {'hostname': '192.168.0.210', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'new production server 4'},
    {'hostname': '192.168.0.129', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'new production server 5'},
    {'hostname': '192.168.0.24', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'new production server 6'},
    {'hostname': '192.168.0.250', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'new production server 7'},
    {'hostname': '192.168.0.162', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'new production server 8'},
    {'hostname': '192.168.0.86', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'new production server 9'},
    {'hostname': '192.168.0.88', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': 'new production server 10'},
]

# Create and start the threads
threads = []
for server in servers:
    thread = threading.Thread(target=thread_function, args=(server,))
    threads.append(thread)
    thread.start()

# Wait for all threads to finish
for thread in threads:
    thread.join()
print("All commands executed.")

121
zzb_data_prod/excel.py Normal file

@@ -0,0 +1,121 @@
import pandas as pd
import json
import utils
from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
import mysql.connector

# Read the Excel file
df = pd.read_excel('/Users/zhengfei/Desktop/ttt.xlsx', header=0)
# Convert the DataFrame to a list of dicts
data_list = df.to_dict(orient='records')

# Period synonym table, formatted as "standard term: synonym, synonym, ..."
period_exra_arr = [
    '当期:本期,本报告期,报告期,报告期内,本年度,本期发生额,2023年,2023年全年,2023年金额',
    '上年同期:上期,上年度,2022年,2022年全年,2022年金额',
    '前年同期:2021年,2021年全年,2021年金额',
    '同比变动:同比增减,同比上升,同比下降,变化幅度,变动比例,本期比上年同期增减,本年比上年增减',
    '报告期末:本报告期末,期末,期末数,期末金额,2023年年末,2023年12月31日',
    '年初至报告期末:上年年末,上年末,2022年年末,2022年12月31日',
    '报告期初:期初,期初数,期初金额,2023年1月1日',
    '当期第一季度:第一季度,1-3月,第一季度1-3月,2023年第一季度',
    '当期第二季度:第二季度,4-6月,第二季度4-6月,2023年第二季度',
    '当期第三季度:第三季度,7-9月,第三季度7-9月,2023年第三季度',
    '当期第四季度:第四季度,10-12月,第四季度10-12月,2023年第四季度',
]
year = 2023

conn = mysql.connector.connect(
    host=MYSQL_HOST,
    user=MYSQL_USER,
    password=MYSQL_PASSWORD,
    database=MYSQL_DB
)
# Create a cursor to execute SQL statements
cursor = conn.cursor()
# insert_query = '''
#     INSERT INTO measure_create_config
#     (config_id, meta_measure, same_mean_measure, measure_period, change_type, black_list)
#     VALUES (%s, %s, %s, %s, %s, %s)
# '''
# for data in data_list:
#     show_measure = str(data['指标'])
#     same_mean_measure = str(data['同义表述'])
#     period_measure = str(data['周期'])
#     change_measure = str(data['变动'])
#     black_list = str(data['黑名单词'])
#     config_id = utils.get_md5(show_measure)
#     insert_query_data = (config_id, show_measure, same_mean_measure, period_measure, change_measure, black_list)
#     cursor.execute(insert_query, insert_query_data)
# conn.commit()

# Read the period Excel file
# df_period = pd.read_excel('/Users/zhengfei/Desktop/period.xlsx', header=0)
# # Convert the DataFrame to a list of dicts
# period_list = df_period.to_dict(orient='records')
# period_insert_query = '''
#     INSERT INTO measure_create_period
#     (period_name, same_mean_period)
#     VALUES (%s, %s)
# '''
# for data in period_list:
#     period_name = str(data['标准表述'])
#     same_mean_period = str(data['同义表述'])
#     insert_query_data = (period_name, same_mean_period)
#     cursor.execute(period_insert_query, insert_query_data)
# conn.commit()
data_query = '''
SELECT * FROM measure_create_config where delete_status = 0
'''
period_query = '''
SELECT * FROM measure_create_period
'''
cursor.execute(data_query)
data_list = cursor.fetchall()
cursor.execute(period_query)
period_list = cursor.fetchall()
for data in data_list:
config_id = data[0]
show_measure = data[1]
same_mean_measure = data[2]
period_measure = data[3]
change_measure = data[4]
same_mean_measure_arr = []
period_measure_arr = []
change_measure_arr = []
if same_mean_measure != 'nan' :
same_mean_measure_arr = same_mean_measure.split(',')
if period_measure != 'nan' :
period_measure_arr = period_measure.split(',')
if change_measure != 'nan' :
change_measure_arr = change_measure.split(',')
for c in change_measure_arr:
period_measure_arr.append(c)
for x in period_measure_arr:
if x in change_measure_arr:
show_name = show_measure+x
else:
show_name = x+show_measure
for y in same_mean_measure_arr:
# membership test against the token list (a substring test on the raw
# comma-joined string could match partial tokens)
if x in change_measure_arr:
parser_name = y+x
else:
parser_name = x+y
print(f'{show_name},{parser_name}')
for p in period_list:
period_exra_name = p[0]
period_exra_value = p[1]
if x.find(period_exra_name) != -1:
for v in period_exra_value.split(','):
if x in change_measure_arr:
parser_name = y + x.replace(period_exra_name, v)
else:
parser_name = x.replace(period_exra_name, v) + y
print(f'{show_name},{parser_name}')
cursor.close()
conn.close()
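# A minimal sketch of the expansion performed above, with made-up values
# (hypothetical config row, not data from measure_create_config):
def _demo_expand():
    show_measure = '营业收入'
    same_mean_measure_arr = ['营业总收入']
    change_measure_arr = ['同比变动']
    period_measure_arr = ['当期'] + change_measure_arr
    for x in period_measure_arr:
        # change-type suffixes go after the metric, periods go in front
        show_name = show_measure + x if x in change_measure_arr else x + show_measure
        for y in same_mean_measure_arr:
            parser_name = y + x if x in change_measure_arr else x + y
            print(f'{show_name},{parser_name}')
    # expected output:
    #   当期营业收入,当期营业总收入
    #   营业收入同比变动,营业总收入同比变动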

View File

@ -0,0 +1,246 @@
import pandas as pd
import mysql.connector
import utils
#from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
import re
import redis
def process_excel_and_db(input_excel_path1, input_excel_path2, output_file_path):
# Read the first Excel file (the ttt workbook)
df = pd.read_excel(input_excel_path1, sheet_name='Sheet2', header=0)
# Convert the DataFrame to a list of dicts
data_list = df.to_dict(orient='records')
# 连接到 MySQL 数据库
conn = mysql.connector.connect(
host=MYSQL_HOST,
user=MYSQL_USER,
password=MYSQL_PASSWORD,
database=MYSQL_DB
)
cursor = conn.cursor()
# 插入数据到 measure_create_config 表
insert_query = '''
INSERT INTO measure_create_config
(config_id, meta_measure, same_mean_measure, measure_period, change_type, black_list)
VALUES (%s, %s, %s, %s, %s, %s)
'''
for data in data_list:
show_measure = str(data['指标'])
same_mean_measure = str(data['同义表述'])
period_measure = str(data['周期'])
change_measure = str(data['变动'])
black_list = str(data['黑名单词'])
config_id = utils.get_md5(show_measure)
insert_query_data = (config_id, show_measure, same_mean_measure, period_measure, change_measure, black_list)
cursor.execute(insert_query, insert_query_data)
conn.commit()
# Read the second Excel file (the period workbook)
df_period = pd.read_excel(input_excel_path2, sheet_name='Sheet2', header=0)
# Convert the DataFrame to a list of dicts
period_list = df_period.to_dict(orient='records')
# 插入数据到 measure_create_period 表
period_insert_query = '''
INSERT INTO measure_create_period
(period_name, same_mean_period)
VALUES (%s, %s)
'''
for data in period_list:
period_name = str(data['标准表述'])
same_mean_period = str(data['同义表述'])
insert_query_data = (period_name, same_mean_period)
cursor.execute(period_insert_query, insert_query_data)
conn.commit()
# 查询数据库
data_query = '''
SELECT * FROM measure_create_config WHERE delete_status = 0
'''
period_query = '''
SELECT * FROM measure_create_period
'''
cursor.execute(data_query)
data_list = cursor.fetchall()
cursor.execute(period_query)
period_list = cursor.fetchall()
# 输出到文件
with open(output_file_path, 'w', encoding='utf-8') as file:
for data in data_list:
config_id = data[0]
show_measure = data[1]
same_mean_measure = data[2]
period_measure = data[3]
change_measure = data[4]
same_mean_measure_arr = []
period_measure_arr = []
change_measure_arr = []
if same_mean_measure != 'nan':
same_mean_measure_arr = same_mean_measure.split(',')
same_mean_measure_arr.append(show_measure)
if period_measure != 'nan':
period_measure_arr = period_measure.split(',')
if change_measure != 'nan':
change_measure_arr = change_measure.split(',')
for c in change_measure_arr:
period_measure_arr.append(c)
for x in period_measure_arr:
if x in change_measure_arr:
show_name = show_measure + x
else:
show_name = x + show_measure
for y in same_mean_measure_arr:
if x in change_measure_arr:  # token-list membership, not a substring match
parser_name = y + x
else:
parser_name = x + y
file.write(f'{show_name},{parser_name}\n')
for p in period_list:
period_exra_name = p[0]
period_exra_value = p[1]
if period_exra_name in x:
for v in period_exra_value.split(','):
if x in change_measure_arr:  # token-list membership, not a substring match
parser_name = y + x.replace(period_exra_name, v)
else:
parser_name = x.replace(period_exra_name, v) + y
file.write(f'{show_name},{parser_name}\n')
cursor.close()
conn.close()
# Generate the new year's measure config rows from an existing year's rows
def create_new_config(conn, cursor, table_name,old_year,new_year):
select_query = f'''
SELECT measure_id, measure_name,ori_measure_id,ori_measure_name,delete_status,measure_vector,distance,year
FROM {table_name}
WHERE year = '{old_year}'
'''
cursor.execute(select_query)
data_list = cursor.fetchall()
insert_query = f'''
INSERT INTO {table_name}
(measure_id, measure_name,ori_measure_id,ori_measure_name,delete_status,measure_vector,distance, year)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
'''
for data in data_list:
ori_measure_name = data[3]
if re.match(r'^\d{4}',ori_measure_name):
year = int(re.match(r'^\d{4}',ori_measure_name).group(0))
year += 1
ori_measure_name = str(year) + ori_measure_name[4:]
insert_data = (data[0],data[1],data[2],ori_measure_name,data[4],data[5],data[6],new_year)
cursor.execute(insert_query, insert_data)
conn.commit()
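# Behaviour sketch for the year bump above (illustrative names, not DB rows):
#   '2024年营业收入'  -> re.match(r'^\d{4}', ...) hits '2024' -> '2025年营业收入'
#   '当期营业收入'    -> no leading 4-digit year -> copied through unchanged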
def measure_config_to_db(conn, cursor, table_name):
year_list = ["2021","2022","2023","2024","2025"]
for year in year_list:
insert_query = f'''
INSERT INTO {table_name}
(measure_id, measure_name, ori_measure_id, ori_measure_name,delete_status,distance,year)
VALUES (%s, %s, %s, %s,%s,%s,%s)
'''
check_query = f'''
SELECT ori_measure_id FROM {table_name}
WHERE year = '{year}'
'''
# 新增指标
lines = [
f"当期营业收入,{year}年第一季度营业收入",
f"当期归母净利润,{year}年第一季度归母净利润",
f"当期扣非净利润,{year}年第一季度扣非净利润",
f"当期经营活动现金流净额,{year}年第一季度经营活动现金流净额",
f"当期筹资活动现金流净额,{year}年第一季度筹资活动现金流净额",
f"当期投资活动现金流净额,{year}年第一季度投资活动现金流净额",
f"当期非经常性损益,{year}年第一季度非经常性损益",
f"当期基本每股收益,{year}年第一季度基本每股收益",
f"当期稀释每股收益,{year}年第一季度稀释每股收益",
f"当期加权平均净资产收益率,{year}年第一季度加权平均净资产收益率",
f"当期扣非加权平均净资产收益率,{year}年第一季度扣非加权平均净资产收益率",
f"当期营业成本 ,{year}年第一季度营业成本",
f"当期销售费用,{year}年第一季度销售费用",
f"当期管理费用,{year}年第一季度管理费用",
f"当期财务费用,{year}年第一季度财务费用",
f"当期研发费用,{year}年第一季度研发费用"]
# Build and insert each configured measure line
for line in lines:
config_list = line.strip().split(',')
measure = config_list[0]
ori_measure = config_list[1]
ori_measure_id = utils.get_md5(ori_measure)
# Skip measures already present in the database for this year
cursor.execute(check_query)
check_records = cursor.fetchall()
if any(record[0] == ori_measure_id for record in check_records):
continue
data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure,0,0.94,year)
cursor.execute(insert_query, data_to_insert)
conn.commit()
def insert_measure_vector(conn,cursor,table_name):
from config import REDIS_HOST,REDIS_PASSWORD,REDIS_PORT
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)# 192.168.0.172 #测试123.60.153.169
# 执行SQL语句更新数据
select_query = f'''
SELECT ori_measure_id,ori_measure_name FROM {table_name}
'''
cursor.execute(select_query)
records = cursor.fetchall()
print(f"总计{len(records)}条数据")
for record in records:
if redis_client.hexists('measure_config', record[0]):
measure_vector = redis_client.hget('measure_config', record[0])
else:
print('新增指标',record[1])
vector_obj = utils.embed_with_str(record[1])
measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])
redis_client.hset('measure_config', record[0], measure_vector)
redis_client.close()
conn.close()
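# The Redis hash 'measure_config' is an embedding cache keyed by ori_measure_id,
# so each measure text is embedded at most once. The same read-through pattern,
# as a standalone sketch (hypothetical helper; assumes utils.embed_with_str
# returns a DashScope embedding response as used above):
def _get_or_embed(redis_client, key, text):
    if redis_client.hexists('measure_config', key):
        return redis_client.hget('measure_config', key)  # cache hit
    vector_obj = utils.embed_with_str(text)              # cache miss: embed once
    vector = str(vector_obj.output["embeddings"][0]["embedding"])
    redis_client.hset('measure_config', key, vector)
    return vector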
#from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
if __name__ == "__main__":
#需要先清空本地数据库的 measure_create_config 和 measure_create_period 表
# process_excel_and_db(
# 'F:\\11_pdf\\ttt_1.xlsx',#ttt文件
# 'F:\\11_pdf\\period_1.xlsx',#period文件
# 'F:\\11_pdf\\out_2022_new_year.txt'#输出文件
# )
from config import MYSQL_HOST_APP, MYSQL_USER_APP, MYSQL_PASSWORD_APP, MYSQL_DB_APP
conn = mysql.connector.connect(
host=MYSQL_HOST_APP,
user=MYSQL_USER_APP,
password=MYSQL_PASSWORD_APP,
database=MYSQL_DB_APP
)
cursor = conn.cursor()
#file_path = r'F:\\11_pdf\\out_2022_new_year.txt'
# 更新第一季度的measure_vector
table_name = 'measure_config_first_quarter'
# 写入mysql
measure_config_to_db(conn, cursor, table_name)
# create_new_config(conn, cursor, table_name,'2024','2025')
# Insert the vectors into Redis
insert_measure_vector(conn,cursor,table_name)

View File

@ -0,0 +1,98 @@
#coding=utf-8
import random
from http import HTTPStatus
from dashscope import Generation
from datetime import datetime
# Send the text and table data to the LLM and return the raw measures it extracts
def get_measure_from_llm(user_prompt):
"""
:return: list of raw measures the LLM extracts from the given text and table data
"""
llm_measure_list = []
system_prompt = '''
你是一个优秀的金融分析师从给定的数据报告中自动提取以下关键财务指标指标包括
2023年营业收入
2022年营业收入
2021年营业收入
2023年第一季度营业收入
2023年第二季度营业收入
2023年第三季度营业收入
2023年第四季度营业收入
营业收入同比变动
2023年归母净利润
2022年归母净利润
2021年归母净利润
2023年第一季度归母净利润
2023年第二季度归母净利润
2023年第三季度归母净利润
2023年第四季度归母净利润
归母净利润同比变动
2023年扣非净利润
2022年扣非净利润
2021年扣非净利润
2023年第一季度扣非净利润
2023年第二季度扣非净利润
2023年第三季度扣非净利润
2023年第四季度扣非净利润
扣非净利润同比变动
2023年经营活动现金流净额
2022年经营活动现金流净额
2021年经营活动现金流净额
经营活动现金流净额同比变动
2023年筹资活动现金流净额
2022年筹资活动现金流净额
2021年筹资活动现金流净额
2023年投资活动现金流净额
2022年投资活动现金流净额
2021年投资活动现金流净额
2023年非经常性损益
2022年非经常性损益
2021年非经常性损益
2023年基本每股收益
2022年基本每股收益
2021年基本每股收益
2023年稀释每股收益
2022年稀释每股收益
2021年稀释每股收益
2023年加权平均净资产收益率
2022年加权平均净资产收益率
2021年加权平均净资产收益率
2023年扣非加权平均净资产收益率
2022年扣非加权平均净资产收益率
2021年扣非加权平均净资产收益率
<数据报告>
<user_prompt>
</数据报告>
'''
system_prompt = system_prompt.replace('<user_prompt>', user_prompt)
response = Generation.call(
model='qwen-plus',
prompt = system_prompt,
seed=random.randint(1, 10000),
top_p=0.8,
result_format='message',
enable_search=False,
max_tokens=1500,
temperature=0.85,
repetition_penalty=1.0
)
if response.status_code == HTTPStatus.OK:
result = response['output']['choices'][0]['message']['content']
llm_measure_list = result.split('\n')
return llm_measure_list
else:
print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
response.request_id, response.status_code,
response.code, response.message
))
return "llm_error"
if __name__ == '__main__':
user_prompt = '''
经营情况回顾 () 经营计划 2023 在国际环境复杂多变以及全球经济依旧下行的形势下公司严格按照既定发展战略和经营计划狠抓落实迎难而上业务经营整体保持稳定如期完成全年既定经营目标在全体职员的共同努力下公司的营业收入净利润等各项指标再创历史新高营业收入较上年同期实现15.43%的增长归属于上市公司股东的净利润较上年同期实现 26.47%的增长 1财务状况 报告期末公司资产总额为 1,473,271,310.23 增幅为 19.17%主要系一方面随着销售规模的不断增长公司应收账款及合同资产等流动资产增幅较大另一方面为解决基于销售规模扩大引致的产能跟不上的瓶颈公司上马扩产建设项目导致在建工程固定资产等非流动资产增幅较报告期末公司负债总额为 800,619,067.70 增幅为 26.12%主要系随着销售规模增加工程建设项目推进固定资产购置等公司采购数额大幅增加公司通过银行借款等方式筹集资金导致长短期贷款期末余额增幅较大 报告期末归属于上市公司股东的净资产为 670,316,339.35 增幅为 11.45%主要系报告期内经营积累 2经营成果 报告期内公司实现营业收入 1,003,535,799.51 增幅为 15.43%主要系公司本期持续优化生产经营大力推进产品研发和创新抓住双碳政策以及能效提升产生的市场需求旺盛的有利时机且随着公司北交所上市产品品牌效应凸显产能增加订单获取能力增强变压器及户外成套设备销售增长较多 营业成本为 810,779,075.89 增幅为 15.33%主要系报告期内销售增长及主要原材料价格变动所致归属于上市公司股东的净利润为 73,033,633.31 增幅为 26.47%主要系1公司持续优化生产经营大力推进产品研发和创新抓住双碳政策以及能效提升产生的市场需求旺盛的有利时机生产和销售均呈稳定增长2本期处置开源路 1-1 号土地及建筑物及其他附属物等结转资产处置收益同比增加
'''
measure_list = get_measure_from_llm(user_prompt)
print(measure_list)

View File

@ -0,0 +1,51 @@
import logging
import os
from logging.handlers import RotatingFileHandler
def setup_logging():
# Create the logs directory if it does not exist
log_dir = 'logs'
if not os.path.exists(log_dir):
os.makedirs(log_dir)
# Configure the root logger
root_logger = logging.getLogger()
# Remove any existing handlers first to avoid duplicates
if root_logger.handlers:
for handler in root_logger.handlers[:]:
root_logger.removeHandler(handler)
root_logger.setLevel(logging.INFO)
# Create the formatter
formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Create the rotating file handler
file_handler = RotatingFileHandler(
os.path.join(log_dir, 'app.log'),
maxBytes=10*1024*1024,  # 10MB
backupCount=5
)
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)
# Create the console handler
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(formatter)
# Attach both handlers to the root logger
root_logger.addHandler(file_handler)
root_logger.addHandler(console_handler)
# Set propagate=False so existing loggers stop propagating upward
# (note: loggers without handlers of their own will then bypass the root handlers)
for logger_name in logging.root.manager.loggerDict:
logger = logging.getLogger(logger_name)
logger.propagate = False
return root_logger
logger = setup_logging()
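# Usage sketch: modules created after setup keep the default propagate=True,
# so their records reach the root handlers configured above, e.g.:
#   import logging
#   log = logging.getLogger(__name__)
#   log.info("parsing started")   # written to logs/app.log and the console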

1134
zzb_data_prod/main.py Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

4164
zzb_data_prod/not_match.txt Normal file

File diff suppressed because it is too large

View File

@ -0,0 +1,108 @@
from config import MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB
import mysql.connector
from http import HTTPStatus
import dashscope
import random,re
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
dashscope.api_key='sk-63c02fbb9b7d4b0494a3200bec1ae286'
def get_company_name(file_path):
line_text = ''
# 我们从PDF中提取页面,page_numbers=[4,5,6]
for pagenum, page in enumerate(extract_pages(file_path)):
if pagenum > 1:
break
# 找到所有的元素
page_elements = [(element.y1, element) for element in page._objs]
# 查找组成页面的元素
for i,component in enumerate(page_elements):
# 提取页面布局的元素
element = component[1]
# 检查该元素是否为文本元素
if isinstance(element, LTTextBoxHorizontal):
# 检查文本是否出现在表中
line_text += element.get_text()
return llm_service(line_text)
def llm_service(user_prompt):
system_prompt = '''
从以下数据报告中提取公司全称只需要提取中文公司全称不要增加其他内容如果提取不到公司全称请返回-
<数据报告>
<user_prompt>
</数据报告>
'''
system_prompt = system_prompt.replace('<user_prompt>', user_prompt)
response = dashscope.Generation.call(
model='qwen-plus',
prompt = system_prompt,
seed=random.randint(1, 10000),
top_p=0.8,
result_format='message',
enable_search=False,
max_tokens=1500,
temperature=0.85,
repetition_penalty=1.0
)
if response.status_code == HTTPStatus.OK:
result = response['output']['choices'][0]['message']['content']
return result
else:
print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
response.request_id, response.status_code,
response.code, response.message
))
return "llm_error"
def update_company_name(file_id, company_name, cursor, conn):
# parameterized query: extracted names may contain quotes, so avoid f-string SQL
update_sql = '''
UPDATE report_check
SET c_name = %s
WHERE id = %s
'''
cursor.execute(update_sql, (company_name, file_id))
conn.commit()
if __name__ == '__main__':
conn = mysql.connector.connect(
host = MYSQL_HOST,
user = MYSQL_USER,
password = MYSQL_PASSWORD,
database = MYSQL_DB
)
# 创建一个cursor对象来执行SQL语句
cursor = conn.cursor()
data_query = '''
SELECT id,file_path FROM report_check where c_name is null
'''
cursor.execute(data_query)
data_list = cursor.fetchall()
for data in data_list:
try:
file_id = data[0]
file_path = f'/usr/local/zhanglei/financial/{data[1]}'
print(f'财报{file_id}开始解析')
# file_id = '1329'
# file_path = '/Users/zhengfei/Desktop/cb/zhangjun-600271-2023-nb-nb.pdf'
company_name = get_company_name(file_path)
contains_newline = '\n' in company_name
if contains_newline:
lines = company_name.splitlines(True)
company_name = lines[0]
if company_name != "llm_error":
update_company_name(file_id, company_name, cursor, conn)
except Exception as e:
print(f'财报{file_id}解析失败',e)
cursor.close()
conn.close()

View File

@ -0,0 +1,268 @@
from config import MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB
import mysql.connector
from http import HTTPStatus
import dashscope
import random,re
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
import PyPDF2
dashscope.api_key='sk-63c02fbb9b7d4b0494a3200bec1ae286'
def get_company_name(file_path):
line_text = ''
# 我们从PDF中提取页面,page_numbers=[4,5,6]
for pagenum, page in enumerate(extract_pages(file_path)):
if pagenum > 1:
break
# 找到所有的元素
page_elements = [(element.y1, element) for element in page._objs]
# 查找组成页面的元素
for i,component in enumerate(page_elements):
# 提取页面布局的元素
element = component[1]
# 检查该元素是否为文本元素
if isinstance(element, LTTextBoxHorizontal):
# 检查文本是否出现在表中
line_text += element.get_text()
return llm_service(line_text)
def get_company_code(file_path):
line_text = ''
# 我们从PDF中提取页面,page_numbers=[4,5,6]
for pagenum, page in enumerate(extract_pages(file_path)):
if pagenum > 1:
break
# 找到所有的元素
page_elements = [(element.y1, element) for element in page._objs]
# 查找组成页面的元素
for i,component in enumerate(page_elements):
# 提取页面布局的元素
element = component[1]
# 检查该元素是否为文本元素
if isinstance(element, LTTextBoxHorizontal):
# 检查文本是否出现在表中
line_text += element.get_text()
return llm_service_code(line_text)
#获取公司简介的那一页
# def get_code_page(pdf_path):
# with open(pdf_path, 'rb') as file:
# reader = PyPDF2.PdfReader(file)
# outlines = reader.outline
# company_profile_page = None
# def find_page_from_outlines(outlines):
# nonlocal company_profile_page
# for item in outlines:
# if isinstance(item, list): # 如果是子目录,则递归
# find_page_from_outlines(item)
# else:
# title = item.title
# if title is not None and '公司简介' in title:
# # 获取页面的实际页码
# page_num = reader.get_destination_page_number(item)
# company_profile_page = page_num
# return
# # 处理没有标题的情况
# elif item.page is not None:
# page_num = reader.get_destination_page_number(item)
# if page_num is not None:
# pass
# find_page_from_outlines(outlines)
# return company_profile_page
# def get_company_code(file_path):
# line_text = ''
# # 我们从PDF中提取页面,page_numbers=[4,5,6]
# for pagenum, page in enumerate(extract_pages(file_path)):
# print(f'页码是{get_code_page(file_path)+1}')
# if pagenum > 1 and pagenum != get_code_page(file_path)+1:
# break
# # 找到所有的元素
# #print(pagenum)
# page_elements = [(element.y1, element) for element in page._objs]
# # 查找组成页面的元素
# # for i,component in enumerate(page_elements):
# # # 提取页面布局的元素
# # element = component[1]
# # # 检查该元素是否为文本元素
# # if isinstance(element, LTTextBoxHorizontal):
# # # 检查文本是否出现在表中
# # line_text += element.get_text()
# for _, element in page_elements:
# if isinstance(element, LTTextBoxHorizontal):
# # 提取文本并添加到 line_text
# line_text += element.get_text()
# return llm_service_code(line_text)
def llm_service(user_prompt):
system_prompt = '''
从以下数据报告中提取公司全称只需要提取中文公司全称不要增加其他内容如果提取不到公司全称请返回-不要返回其他任何内容
<数据报告>
<user_prompt>
</数据报告>
'''
system_prompt = system_prompt.replace('<user_prompt>', user_prompt)
response = dashscope.Generation.call(
model='qwen-plus',
prompt = system_prompt,
seed=random.randint(1, 10000),
top_p=0.8,
result_format='message',
enable_search=False,
max_tokens=1500,
temperature=0.85,
repetition_penalty=1.0
)
if response.status_code == HTTPStatus.OK:
result = response['output']['choices'][0]['message']['content']
return result
else:
print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
response.request_id, response.status_code,
response.code, response.message
))
return "llm_error"
def llm_service_code(user_prompt):
system_prompt = '''
从以下数据报告中提取6位数字的股票代码只需要提取股票代码如果有多个则以','隔开不要增加其他内容如果提取不到股票代码请返回-,不要返回其他任何内容
<数据报告>
<user_prompt>
</数据报告>
'''
system_prompt = system_prompt.replace('<user_prompt>', user_prompt)
response = dashscope.Generation.call(
model='qwen-plus',
prompt = system_prompt,
seed=random.randint(1, 10000),
top_p=0.8,
result_format='message',
enable_search=False,
max_tokens=1500,
temperature=0.85,
repetition_penalty=1.0
)
if response.status_code == HTTPStatus.OK:
result = response['output']['choices'][0]['message']['content']
return result
else:
print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
response.request_id, response.status_code,
response.code, response.message
))
return "llm_error"
def update_company_name(file_id, company_name, company_code, cursor, conn):
# parameterized query: extracted values may contain quotes, so avoid f-string SQL
update_sql = '''
UPDATE report_check
SET c_name = %s, c_code = %s
WHERE id = %s
'''
cursor.execute(update_sql, (company_name, company_code, file_id))
conn.commit()
def name_code_fix(file_id,file_path):
conn = mysql.connector.connect(
host = MYSQL_HOST,
user = MYSQL_USER,
password = MYSQL_PASSWORD,
database = MYSQL_DB
)
# 创建一个cursor对象来执行SQL语句
cursor = conn.cursor()
try:
# file_id = data[0]
# #生产环境地址
# file_path = f'/usr/local/zhanglei/financial{data[1]}'
# #测试环境地址
# # file_path_1 = f'/root/pdf_parser/pdf/{data[1]}'
# # file_path = file_path_1.replace('/upload/file/','')
# print(f'财报{file_id}开始解析')
# #file_id = '305'
# #file_path = r"F:\11_pdf\7874.pdf"
company_name = get_company_name(file_path)
contains_newline = '\n' in company_name
if contains_newline:
lines = company_name.splitlines(True)
company_name = lines[0]
company_code = get_company_code(file_path)
contains_newline1 = '\n' in company_code
if contains_newline1:
lines = company_code.splitlines(True)
company_code = lines[0]
if company_name != "llm_error" or company_code != "llm_error":
#print(company_code)
pattern = re.compile(r'^(\d{6}|\d{6}(,\d{6})*)$')
if not pattern.match(company_code):
company_code = '-'
if len(company_name) > 15 or company_name == '-':
company_name = ''
update_company_name(file_id, company_name,company_code, cursor, conn)
except Exception as e:
print(f'财报解析失败',e)
cursor.close()
conn.close()
if __name__ == '__main__':
conn = mysql.connector.connect(
host = MYSQL_HOST,
user = MYSQL_USER,
password = MYSQL_PASSWORD,
database = MYSQL_DB
)
# 创建一个cursor对象来执行SQL语句
cursor = conn.cursor()
data_query = '''
SELECT id,file_path FROM report_check where c_code is null
'''
cursor.execute(data_query)
data_list = cursor.fetchall()
for data in data_list:
try:
file_id = data[0]
#生产环境地址
file_path = f'/usr/local/zhanglei/financial{data[1]}'
#测试环境地址
# file_path_1 = f'/root/pdf_parser/pdf/{data[1]}'
# file_path = file_path_1.replace('/upload/file/','')
print(f'财报{file_id}开始解析')
#file_id = '305'
#file_path = r"F:\11_pdf\7874.pdf"
company_name = get_company_name(file_path)
contains_newline = '\n' in company_name
if contains_newline:
lines = company_name.splitlines(True)
company_name = lines[0]
company_code = get_company_code(file_path)
contains_newline1 = '\n' in company_code
if contains_newline1:
lines = company_code.splitlines(True)
company_code = lines[0]
if company_name != "llm_error" or company_code != "llm_error":
#print(company_code)
pattern = re.compile(r'^(\d{6}|\d{6}(,\d{6})*)$')
if not pattern.match(company_code):
company_code = '-'
update_company_name(file_id, company_name,company_code, cursor, conn)
except Exception as e:
print(f'财报解析失败',e)
cursor.close()
conn.close()

View File

@ -0,0 +1,98 @@
# Cleanup: remove downloaded PDFs on every server
import paramiko
import time
import threading
# 执行命令的函数
def execute_commands_on_server(hostname, username, password, host):
try:
# 连接到服务器
client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect(hostname=hostname, username=username, password=password)
# 执行命令
shell = client.invoke_shell()
# Remove downloaded PDFs from the parser directory
shell.send("cd /root/pdf_parser/pdf\n")
time.sleep(1)
shell.send("rm -f *.pdf\n")
time.sleep(10)
shell.send("rm -f *.PDF\n")
time.sleep(10)
# 读取输出
output = shell.recv(2048).decode()
print(f"Output from {hostname}:\n{output}")
except paramiko.SSHException as e:
print(f"SSH connection error with {hostname}: {e}")
finally:
client.close()
# 创建线程函数
def thread_function(server):
execute_commands_on_server(server['hostname'], server['username'], server['password'], server['host'])
# 服务器列表
# servers = [
# {'hostname': 'server1.example.com', 'username': 'user1', 'password': 'pass1', 'host': 'host1'},
# {'hostname': 'server2.example.com', 'username': 'user2', 'password': 'pass2', 'host': 'host2'},
# # 添加更多服务器
# ]
servers = [
#{'hostname': '124.70.129.232', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'测试服务器'},
# {'hostname': '1.94.179.121', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器'},#废弃
# original 10 servers
{'hostname': '113.44.72.157', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器1'},
{'hostname': '1.94.101.237', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器2'},
{'hostname': '123.60.16.225', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器3'},
{'hostname': '124.71.157.162', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器4'},
{'hostname': '1.94.60.103', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器5'},
# {'hostname': '1.94.143.23', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器6'},#都往这里存
{'hostname': '124.71.149.225', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器7'},
{'hostname': '113.44.52.221', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器8'},
{'hostname': '121.37.137.13', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器9'},
{'hostname': '123.60.28.83', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器10'},
# 10 new servers
{'hostname': '192.168.0.19', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新生产服务器1'},
{'hostname': '192.168.0.53', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新生产服务器2'},
{'hostname': '192.168.0.150', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新生产服务器3'},
{'hostname': '192.168.0.210', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新生产服务器4'},
{'hostname': '192.168.0.129', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新生产服务器5'},
{'hostname': '192.168.0.24', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新生产服务器6'},
{'hostname': '192.168.0.250', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新生产服务器7'},
{'hostname': '192.168.0.162', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新生产服务器8'},
{'hostname': '192.168.0.86', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新生产服务器9'},
{'hostname': '192.168.0.88', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新生产服务器10'},
# 11 more new servers
{'hostname': '192.168.0.93', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新1生产服务器1'},
{'hostname': '192.168.0.228', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新1生产服务器2'},
{'hostname': '192.168.0.155', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新1生产服务器3'},
{'hostname': '192.168.0.186', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新1生产服务器4'},
{'hostname': '192.168.0.56', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新1生产服务器5'},
{'hostname': '192.168.0.185', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新1生产服务器6'},
{'hostname': '192.168.0.72', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新1生产服务器7'},
{'hostname': '192.168.0.35', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新1生产服务器8'},
{'hostname': '192.168.0.230', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新1生产服务器9'},
{'hostname': '192.168.0.125', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新1生产服务器10'},
{'hostname': '192.168.0.46', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新1生产服务器11'},
#
]
# Create and start one thread per server
threads = []
for server in servers:
thread = threading.Thread(target=thread_function, args=(server,))
threads.append(thread)
thread.start()
# Wait for all threads to finish
for thread in threads:
thread.join()
print("All commands executed.")

240
zzb_data_prod/pdf_title.py Normal file
View File

@ -0,0 +1,240 @@
import PyPDF2
import re
import os,threading
from config import REDIS_HOST,REDIS_PORT,REDIS_PASSWORD
import redis
import db_service
def get_tree_pages(root, info, depth=0, title_array=None):
"""
Recursively iterate the outline tree
Find the pages pointed by the outline item
and get the assigned physical order id
Decrement with padding if necessary
"""
if title_array is None:  # avoid Python's shared mutable-default pitfall
title_array = []
if isinstance(root, dict):
# print(root)
page = root['/Page'].get_object()
# print(id(page))
t = root['/Title']
title = t
if isinstance(t, PyPDF2.generic.ByteStringObject):
title = t.original_bytes.decode('utf8')
title = title.strip()
title = title.replace('\n', '')
title = title.replace('\r', '')
page_num = info['all_pages'].get(id(page), 0)
if page_num == 0:
print('Not found page number for /Page!', page)
elif page_num < info['padding']:
page_num = 0
else:
page_num -= info['padding']
# str_val = '%-5d' % page_num
# str_val += '\t' * depth
# str_val += title + '\t' + '%3d' % page_num
# print(str_val)
title_array.append({
'title': title,
'page_num': page_num,
'depth': depth
})
for elem in root:
get_tree_pages(elem, info, depth+1,title_array)
return title_array
def recursive_numbering(obj, info):
"""
Recursively iterate through all the pages in order and assign them a physical
order number
"""
# print(id(obj), obj)
if obj['/Type'] == '/Page':
obj_id = id(obj)
if obj_id not in info['all_pages']:
info['all_pages'][obj_id] = info['current_page_id']
info['current_page_id'] += 1
return
elif obj['/Type'] == '/Pages':
for page in obj['/Kids']:
recursive_numbering(page.get_object(), info)
def get_numbers_between(numbers_between, start, end):
# append every number from start to end (inclusive) to the supplied list
for i in range(start, end + 1):
numbers_between.append(i)
return numbers_between
def get_page_end(start, depth, title_array):
page_end = -1
for i in range(start, len(title_array)):
if title_array[i]['depth'] == depth:
page_end = title_array[i]['page_num']
break
return page_end
def get_file_split(page_count):
# fixed number of worker parts (not the actual CPU count), capped by the page count
cpu_count = 4
if page_count < cpu_count:
cpu_count = page_count
# divmod gives pages-per-part and the remainder
quotient, remainder = divmod(page_count, cpu_count)
table_split_parts = []
text_split_parts = []
for i in range(cpu_count):
start_num = i * quotient
if i < cpu_count-1:
start_num = i * quotient
end_num = start_num+quotient
else:
end_num = page_count
table_split_parts.append(f'{start_num}-{end_num}')
text_split_parts.append(get_numbers_between([],start_num, end_num))
# 返回除法结果和余数
return {
'table_split_parts': table_split_parts,
'text_split_parts': text_split_parts
}
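# Worked example: get_file_split(10) computes divmod(10, 4) == (2, 2) and yields
#   table_split_parts = ['0-2', '2-4', '4-6', '6-10']
#   text_split_parts  = [[0, 1, 2], [2, 3, 4], [4, 5, 6], [6, 7, 8, 9, 10]]
# Boundary pages are shared: each part's end page is the next part's start page,
# because get_numbers_between is end-inclusive.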
def create_text_outline(pdf_path, file_id):
# print('Running the script for [%s] with padding [%d]' % (pdf_path, page_number_padding))
# creating an object
with open(pdf_path, 'rb') as file:
file_info = {}
fileReader = PyPDF2.PdfReader(file)
page_count = len(fileReader.pages)
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
redis_client.set(f'page_count_{file_id}', page_count)
info = {
'page_count': page_count,
'all_pages': {},
'current_page_id': 1,
'padding': 0
}
print('Number of pages: %d' % info['page_count'])
pages = fileReader.trailer['/Root']['/Pages'].get_object()
recursive_numbering(pages, info)
#for page_num, page in enumerate(pages['/Kids']):
# page_obj = page.getObject()
# all_pages[id(page_obj)] = page_num + 1 # who starts counting from 0 anyways?
title_array = get_tree_pages(fileReader.outline, info, 0, [])
db_service.pdf_title_insert_mysql(file_id,title_array)
title_array = db_service.get_file_info_from_mysql(file_id)
parent_table_pages_local = {}
parent_table_pages_local[file_id] = []
print(f'{file_id}:{len(title_array)}')
for i in range(len(title_array)):
title_obj = title_array[i]
title = title_obj['title']
#print(f'标题分别是{title}')
if len(re.findall('母公司|现金流量表补充|重要会计政策|会计估计变更|公允价值的披露|合营安排或联营企业中的权益|与金融工具相关的风险|税项|主要控股参股公司|结构化主体情况|公司股份总数及股东结构变动及公司资产和负债结构的变动情况|所有权或使用权受到限制的资产|在建工程|固定资产|其他主体中的权益|分部信息|与金融工具相关的风险|其他关联交易|公司子公司重大事项', title)) >0 :
page_start = title_obj['page_num']
depth = title_obj['depth']
if i < len(title_array) - 1:
page_end = title_array[i+1]['page_num']
if title_array[i]['depth'] in [1,2]:
page_end = get_page_end(i+1, depth, title_array)
else:
page_end = page_count
print(f'目录识别时被丢弃的页码:{page_start}-{page_end}')
#当标题为母公司财务报表主要项目注释时最后一页不过滤避免核心roe指标无法召回
if len(re.findall('财务报表主要项目注释', title)) == 0:
page_end = page_end - 1
# print(title,page_start,page_end)
for i in range(page_start, page_end + 1):
# 将每个数字添加到列表中
parent_table_pages_local[file_id].append(i)
file_info['page_count'] = page_count
file_info['parent_table_pages'] = parent_table_pages_local[file_id]
file_info['split_parts'] = get_file_split(page_count)
redis_client.close()
return file_info
def create_text_outline_disclosure(pdf_path, file_id):
# print('Running the script for [%s] with padding [%d]' % (pdf_path, page_number_padding))
# creating an object
with open(pdf_path, 'rb') as file:
file_info = {}
fileReader = PyPDF2.PdfReader(file)
page_count = len(fileReader.pages)
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
redis_client.set(f'page_count_{file_id}', page_count)
info = {
'page_count': page_count,
'all_pages': {},
'current_page_id': 1,
'padding': 0
}
print('Number of pages: %d' % info['page_count'])
pages = fileReader.trailer['/Root']['/Pages'].get_object()
recursive_numbering(pages, info)
#for page_num, page in enumerate(pages['/Kids']):
# page_obj = page.getObject()
# all_pages[id(page_obj)] = page_num + 1 # who starts counting from 0 anyways?
title_array = get_tree_pages(fileReader.outline, info, 0, [])
#db_service.pdf_title_insert_mysql(file_id,title_array)
#title_array = db_service.get_file_info_from_mysql(file_id)
parent_table_pages_local = {}
parent_table_pages_local[file_id] = []
print(f'{file_id}:{len(title_array)}')
for i in range(len(title_array)):
title_obj = title_array[i]
title = title_obj['title']
#print(f'标题分别是{title}')
if len(re.findall('母公司|现金流量表补充|重要会计政策|会计估计变更|公允价值的披露|合营安排或联营企业中的权益|与金融工具相关的风险|税项|主要控股参股公司|结构化主体情况|公司股份总数及股东结构变动及公司资产和负债结构的变动情况|所有权或使用权受到限制的资产|在建工程|固定资产|其他主体中的权益|分部信息|与金融工具相关的风险|其他关联交易|公司子公司重大事项', title)) >0 :
page_start = title_obj['page_num']
depth = title_obj['depth']
if i < len(title_array) - 1:
page_end = title_array[i+1]['page_num']
if title_array[i]['depth'] in [1,2]:
page_end = get_page_end(i+1, depth, title_array)
else:
page_end = page_count
print(f'目录识别时被丢弃的页码:{page_start}-{page_end}')
#当标题为母公司财务报表主要项目注释时最后一页不过滤避免核心roe指标无法召回
if len(re.findall('财务报表主要项目注释', title)) == 0:
page_end = page_end - 1
# print(title,page_start,page_end)
for i in range(page_start, page_end + 1):
# 将每个数字添加到列表中
parent_table_pages_local[file_id].append(i)
file_info['page_count'] = page_count
file_info['parent_table_pages'] = parent_table_pages_local[file_id]
file_info['split_parts'] = get_file_split(page_count)
redis_client.close()
return file_info
if __name__ == '__main__':
import time
path = "/Users/zhengfei/Desktop/cb/2023年报检测/安妮股份.pdf"
threading.Thread(target=create_text_outline, args=(path,'111')).start()
time.sleep(5)
threading.Thread(target=create_text_outline, args=(path,'222')).start()

24
zzb_data_prod/put_code.sh Normal file
View File

@ -0,0 +1,24 @@
#!/bin/bash
# Set the file paths and destination directory. NOTE: the config file must NOT be pushed: /root/pdf_parser/zzb_data_prod/utils.py /root/pdf_parser/zzb_data_prod/db_service.py
#FILES="/root/pdf_parser/zzb_data_prod/utils.py /root/pdf_parser/zzb_data_prod/db_service.py /root/pdf_parser/zzb_data_prod/app.py /root/pdf_parser/zzb_data_prod/main.py /root/pdf_parser/zzb_data_prod/pdf_title.py"
FILES="/root/pdf_parser/zzb_data_prod/put_code.sh"
DEST_PATH="/root/pdf_parser/zzb_data_prod"
# Server list. Primary server: "1.94.143.23" "113.44.72.157" "1.94.101.237" "123.60.16.225" "124.71.157.162" "1.94.60.103" "1.94.143.23" "124.71.149.225" "113.44.52.221" "121.37.137.13"
#SERVERS=("113.44.72.157" "1.94.101.237" "123.60.16.225" "124.71.157.162" "1.94.60.103" "124.71.149.225" "113.44.52.221" "121.37.137.13" "123.60.28.83" "192.168.0.19" "192.168.0.53" "192.168.0.150" "192.168.0.210" "192.168.0.129" "192.168.0.24" "192.168.0.250" "192.168.0.162" "192.168.0.86" "192.168.0.88" "192.168.0.93" "192.168.0.228" "192.168.0.155" "192.168.0.186" "192.168.0.56" "192.168.0.185" "192.168.0.72" "192.168.0.35" "192.168.0.230" "192.168.0.125" "192.168.0.46" "192.168.0.131")
#SERVERS=("192.168.0.228" "192.168.0.155" "192.168.0.186" "192.168.0.56" "192.168.0.185")
# regulator servers
#SERVERS=("192.168.0.108" "192.168.0.131")
# enterprise servers
#SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239")
# both groups together
SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239" "192.168.0.108" "192.168.0.131")
# upload the files to every server
for SERVER in "${SERVERS[@]}"; do
echo "Uploading files to $SERVER"
scp -r $FILES root@$SERVER:$DEST_PATH
echo "Finished uploading to $SERVER"
done

View File

@ -0,0 +1,16 @@
import redis
# Write one measure vector into the measure_config hash in Redis
def read_from_file_and_write_to_redis(redis_client, ori_measure_id, measure_vector):
redis_client.hset('measure_config', ori_measure_id, measure_vector)
# Read one measure vector back from the measure_config hash
def read_from_redis(redis_client, ori_measure_id):
return redis_client.hget('measure_config', ori_measure_id).decode()
if __name__ == "__main__":
redis_client = redis.Redis(host='192.168.0.175', port=6379, password='Xgf_redis', db=6)
value = read_from_redis(redis_client,"bb3cf43f3dba147373c706c6567b5a")
print(value)

View File

@ -0,0 +1,15 @@
camelot-py==0.11.0
pdfminer.six==20221105
PyPDF2==3.0.1
pdfplumber==0.10.3
pymilvus==2.3.3
mysql-connector-python==8.3.0
dashscope==1.17.0
fastapi
pydantic
uvicorn
redis
ghostscript
opencv-python-headless
python-docx
docx2pdf

View File

@ -0,0 +1,6 @@
pdfminer.six==20221105
PyPDF2==3.0.1
pdfplumber==0.10.3
pymilvus==2.3.3
mysql-connector-python==8.3.0
dashscope==1.17.0

3
zzb_data_prod/test.pdf Normal file
View File

@ -0,0 +1,3 @@
--2024-12-27 11:23:36-- https://financial-report.obs.cn-east-3.myhuaweicloud.com/upload/file/44b374ac0fe140a2922c360db47335a1.PDF?AccessKeyId=WMBIZTLULUR24OBUIRC4
Resolving financial-report.obs.cn-east-3.myhuaweicloud.com (financial-report.obs.cn-east-3.myhuaweicloud.com)... failed: Name or service not known.
wget: unable to resolve host address financial-report.obs.cn-east-3.myhuaweicloud.com

154
zzb_data_prod/test.py Normal file
View File

@ -0,0 +1,154 @@
#coding=utf-8
import sys,ast
from pdfminer.high_level import extract_text
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
import utils
import mysql.connector
from pymilvus import connections,MilvusClient
import json
import db_service
import ast
import numpy as np
import config
import redis_service
from config import MILVUS_CLIENT,MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB
import main
import redis
def measure_config_to_db(conn,cursor):
insert_query = '''
INSERT INTO measure_config
(measure_id, measure_name, ori_measure_id, ori_measure_name)
VALUES (%s, %s, %s, %s)
'''
check_query = '''
select ori_measure_id from measure_config
'''
# 打开文本文件
with open('/Users/zhengfei/work/zzb_data/measure_config_all.txt', 'r') as file:
# 读取所有行到一个列表中
lines = file.readlines()
# 打印每一行
for line in lines:
config_list = line.strip().split(',')
measure = config_list[0]
ori_measure = config_list[1]
ori_measure_id = utils.get_md5(ori_measure)
# 判断数据库中是否有数据
# cursor.execute(check_query.format(ori_measure_id=ori_measure_id))
# check_records = cursor.fetchall()
# if(len(check_records)) > 0:
# continue
data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure)
cursor.execute(insert_query, data_to_insert)
conn.commit()
def insert_measure_vector(conn,cursor):
redis_client = redis.Redis(host='192.168.0.172', port=6379, password='Xgf_redis', db=6)
# 执行SQL语句更新数据
select_query = '''
SELECT ori_measure_id,ori_measure_name FROM measure_config
'''
cursor.execute(select_query)
records = cursor.fetchall()
for record in records:
if redis_client.hexists('measure_config', record[0]):
measure_vector = redis_client.hget('measure_config', record[0])
else:
print('新增指标',record[1])
vector_obj = utils.embed_with_str(record[1])
measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])
redis_client.hset('measure_config', record[0], measure_vector)
redis_client.close()
conn.close()
def contains_financial_indicators(text):
import re
# thousand-separated numbers, e.g. 1,234.56 (defined but unused below)
pattern = r"\d{1,3}(,\d{3})+(\.\d{1,3})?"
# numbers/percentages; the unescaped '.' also lets ',' separators match
pattern1 = r"\d+(.\d+)+%?"
match = re.search(pattern1, text)
# True if a numeric or percentage value was found, else False
return bool(match)
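# e.g. contains_financial_indicators('营业收入 1,234.56 万元')  -> True
#      contains_financial_indicators('同比增长 15.43%')        -> True
#      contains_financial_indicators('主要会计数据')            -> False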
def get_clean_text(text):
import re
# bracketed content in full-width parentheses
pattern = r"([^)]*?)"
matches = re.findall(pattern, text)
for match in matches:
# check whether the bracketed content mentions 归属于/扣非
month_keywords_found = re.search(r"归属于|扣非", match)
if not month_keywords_found:
# if it does not, drop the bracketed part from the text
text = re.sub(pattern, "", text)
else:
# if it does, strip all punctuation instead
text = re.sub(r"[^\w\s]", "", text)
print(text)
def insert_and_update(conn,cursor,client,parent_table_pages,file_id,path):
# #通过向量查询指标
db_service.insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_id,path)
# #指标归一化处理
db_service.update_ori_measure(conn,cursor,file_id)
def print_measure_data(cursor,client):
select_query = '''
SELECT ori_measure_name,measure_name,ori_measure_id FROM measure_config
where measure_id not in(select distinct measure_id from ori_measure_list where file_id='64')
'''
cursor.execute(select_query)
records = cursor.fetchall()
for record in records:
ori_measure_name = record[0]
measure_name = record[1]
ori_measure_id = record[2]
measure_vector = redis_service.read_from_redis(ori_measure_id)
measure_list = ast.literal_eval(measure_vector)
data = [measure_list]
res = client.search(
collection_name="pdf_measure_v4", # Replace with the actual name of your collection
# Replace with your query vector
data=data,
limit=2, # Max. number of search results to return
search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
output_fields=["measure_name","measure_value","table_num","table_index"],
filter = 'file_id == "64"'
)
vector_str = measure_name+":"+ori_measure_name
# Convert the output to a formatted JSON string
for i in range(len(res[0])):
vector_distance = float(res[0][i]["distance"])
vector_measure_name = res[0][i]["entity"]["measure_name"]
measure_value = res[0][i]["entity"]["measure_value"]
table_num = res[0][i]["entity"]["table_num"]
table_index = res[0][i]["entity"]["table_index"]
table_num_list = [106]
print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index))
# if vector_distance > 0.89 and table_num not in table_num_list:
# print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(0.94))
# if vector_distance > distance and table_num not in table_num_list:
# print(vector_str +":"+vector_measure_name +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(vector_distance)+":"+str(distance))
if __name__ == "__main__":
conn = mysql.connector.connect(
host=MYSQL_HOST,
user=MYSQL_USER,
password=MYSQL_PASSWORD,
database=MYSQL_DB
)
cursor = conn.cursor()
insert_measure_vector(conn,cursor)

View File

@ -0,0 +1 @@
import re

View File

@ -0,0 +1,92 @@
#import camelot
import re
#from multiprocessing import Pool
import os, time, random
import json
#from config import MILVUS_CLIENT,MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB,MEASURE_COUNT
from datetime import datetime
# 读取PDF
import PyPDF2
# 分析PDF的layout提取文本
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
import pdfplumber
import mysql.connector
#import utils
from pymilvus import MilvusClient
#import llm_service
#import db_service
#import pdf_title
import numpy as np
#from multiprocessing import Process
def text_in_table(top, tables_range, page_num):
if tables_range.get(page_num):
for range in tables_range[page_num]:
if top < range['top'] and top > range['buttom']:
return True
return False
def get_text_type(text: str):
text = re.sub(r"\s", "", text)
first_re = '年度报告'
page_number_pattern = re.compile(r'^\d+(/\d+)?$')
if re.search(first_re, text.strip()):
return 'page_header'
if page_number_pattern.match(text.strip()):
return 'page_footer'
return 'text'
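# e.g. get_text_type('2023 年度报告')        -> 'page_header'
#      get_text_type('12/348')              -> 'page_footer'
#      get_text_type('营业收入增长15.43%')   -> 'text'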
def get_text_content_test(file_path, file_id, pages, tables_range):
page_start = pages.split('-')[0]
page_end = pages.split('-')[1]
# extract pages from the PDF, skipping those outside the requested range
for pagenum, page in enumerate(extract_pages(file_path)):  # use the parameter, not the module-level pdf_path
try:
if pagenum+1 < int(page_start) or pagenum+1 > int(page_end):
continue
# 找到所有的元素
page_elements = [(element.y1, element) for element in page._objs]
# 查找组成页面的元素
for i,component in enumerate(page_elements):
# 提取页面布局的元素
element = component[1]
# 检查该元素是否为文本元素
if isinstance(element, LTTextBoxHorizontal):
# 检查文本是否出现在表中
line_text = element.get_text().replace('\n','')
line_text = re.sub(r"\s", "", line_text)
#print(f'line_text 的值是{line_text}')
element_top = element.bbox[3]
element_buttom = element.bbox[1]
# 检查该文本是否出现在表中
if tables_range.get(pagenum+1):
for range in tables_range[pagenum+1]:
if element_top < range['top'] and element_top > range['buttom']:
pass
else:
if element_top - range['top'] < 150 and element_top - range['top'] > 5 and not text_in_table(element_top, tables_range, pagenum+1):
text_type = get_text_type(line_text)
if text_type == 'page_header':
break
# 记录需要过滤掉的页码
if len(re.findall('母公司|现金流量表补充', line_text)) > 0:
print('成功识别到了')
except Exception as e:
print(f"Error processing page {pagenum+1}: {e}")
pdf_path = r"combined_v61.pdf"
file_id = 1
tables_range = {1: [{'top': 727.0118072976055, 'buttom': 77.52552451539339, 'table_index': 1, 'page_num': 1}], 2: [{'top': 687.408985176739, 'buttom': 77.04549030786774, 'table_index': 1, 'page_num': 2}]}
pages = '1-2'
get_text_content_test(pdf_path,file_id,pages,tables_range)

View File

@ -0,0 +1,325 @@
import camelot
import re
#from multiprocessing import Pool
import os, time, random
import json
#from config import MILVUS_CLIENT,MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB,MEASURE_COUNT
from datetime import datetime
# 读取PDF
import PyPDF2
# 分析PDF的layout提取文本
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
import pdfplumber
import mysql.connector
#import utils
from pymilvus import MilvusClient
#import llm_service
#import db_service
#import pdf_title
import numpy as np
#from multiprocessing import Process
STR_PATTERN = '营业收入|净利润|变动比例|损益|现金流量净额|现金净流量|现金流|每股收益|总资产|资产总额|收益率|货币资金|应收账款|存货|固定资产|在建工程|商誉|短期借款|应付账款|合同负债|长期借款|营业成本|销售费用|管理费用|财务费用|研发费用|研发投入'
#负责表内一旦出现某个字符,整个表丢弃
PATTERN = '品牌类型|分门店|销售渠道|行业名称|产品名称|地区名称|子公司名称|业绩快报|调整情况说明|调整年初资产负债表|计入当期损益的政府补助|主要子公司|分部|母公司资产负债表|显示服务|渠道|商品类型|合同分类|会计政策变更|地区分类'
#unit_pattern = re.compile(r'单位[|:]?(百万元|千万元|亿元|万元|千元|元)')
MUILT_PATTERN = '调整前'
file_path = r"combined_v61.pdf"
file_id = 1
pages = '1-2'
tables_range = {}
# def get_table_range_test(file_path, file_id, pages, tables_range):
# print('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
# #(f'file_path: {file_path},file_id:{file_id},pages:{pages},tables_range:{tables_range}')
# start = time.time()
# import tempfile
# temp_dir_path = "F:\\temp"
# # 检查并创建临时文件夹
# if not os.path.exists(temp_dir_path):
# os.makedirs(temp_dir_path)
# # 创建临时文件夹
# temp_dir = tempfile.mkdtemp(prefix="camelot_temp_", dir=temp_dir_path)
# # 设置全局临时文件夹路径
# os.environ["TMP"] = temp_dir
# os.environ["TEMP"] = temp_dir
# # conn = mysql.connector.connect(
# # host= MYSQL_HOST,
# # user= MYSQL_USER,
# # password= MYSQL_PASSWORD,
# # database= MYSQL_DB
# # )
# # 创建一个cursor对象来执行SQL语句
# #print(f'file_path的值是{file_path}')
# #cursor = conn.cursor()
# # try:
# # tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'])
# # print('读取成功')
# # except Exception as e:
# # print(f'错误在{e}')
# #print(f'file_path的值是{file_path}')
# #file_path = "F:\\11_pdf\\688670-2023-nb-nb.pdf"
# os.environ["GHOSTSCRIPT_BINARY"] = "gswin64c"
# try:
# # 确保 file_path 是正确的,并且文件是可访问的
# if not os.path.exists(file_path):
# print(f'文件路径不正确或文件不存在: {file_path}')
# raise FileNotFoundError(f"文件不存在:{file_path}")
# else:
# pass#(f'file_path是存在的就是{file_path}')
# # 读取 PDF 文件
# #tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n')#, copy_text=['h']
# #tables = camelot.read_pdf(file_path, pages=pages, flavor='lattice', strip_text=' ,\n', temp_dir=temp_dir)
# tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'], temp_dir=temp_dir)#line_scale=10,
# print('读取成功')
# print("检测到的表格数量:", tables.n)
# except FileNotFoundError as fe:
# print(fe)
# except Exception as e:
# print(f'处理PDF时出错: {e}')
# for t in tables:
# top = t._bbox[3]
# buttom = t._bbox[1]
# page_num = int(t.page)
# table_index = int(t.order)
# arr = np.array(t.data)
# #recent_value = None
# #这里开始对可能解析错误的值做判断:
# for i, row in enumerate(arr):
# if len(row) >= 4:
# # first_value = row[0]
# # if ("2023年度" in first_value or "2022年度" in first_value) and len(first_value) <= 12:
# # recent_value = first_value
# # if first_value == '' and recent_value:
# # row[0] = recent_value
# # 检查条件:第一列不为数字,第二列和第四列为空,第三列有三个小数点【三列的数字被识别到一起了】
# if (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 4 and len(row[2].rsplit('.', 1)[-1]) == 2) and (row[3] == ''):
# split_values = row[2].split('.')
# # 确保可以正确拆分成三个数值
# if len(split_values) == 4:
# new_value1 = f"{split_values[0]}.{split_values[1][:2]}"
# new_value2 = f"{split_values[1][2:]}.{split_values[2][:2]}"
# new_value3 = f"{split_values[2][2:]}.{split_values[3]}"
# row[1] = new_value1
# row[2] = new_value2
# row[3] = new_value3
# #检查条件:第一列不为数字,第二列第四列为空,第三列两个小数点,第五列两个小数点【两列的数字被识别到一起了】
# if len(row) >= 5 and (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 3) and (row[3] == '') and (len(row[4].split('.')) == 3) and len(row[2].rsplit('.', 1)[-1]) == 2 and len(row[4].rsplit('.', 1)[-1]) == 2:
# split_value_3 = row[2].split('.')
# split_value_5 = row[4].split('.')
# if len(split_value_3) == 3:
# new_value2 = f"{split_value_3[0]}.{split_value_3[1][:2]}"
# new_value3 = f"{split_value_3[1][2:]}.{split_value_3[2]}"
# if len(split_value_5) == 3:
# new_value4 = f"{split_value_5[0]}.{split_value_5[1][:2]}"
# new_value5 = f"{split_value_5[1][2:]}.{split_value_5[2]}"
# row[1] = new_value2
# row[2] = new_value3
# row[3] = new_value4
# row[4] = new_value5
# #检查条件:第一列不为数字,第二列为空,第三列有两个小数点,第四列为正常数字【两列的数字被识别到一起了】
# if len(row) >= 4 and (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 3) and len(row[2].rsplit('.', 1)[-1]) == 2 and (row[3].replace('-', '', 1).replace('.', '', 1).isdigit()):
# split_values = row[2].split('.')
# if len(split_values) == 3:
# new_value2 = f"{split_values[0]}.{split_values[1][:2]}"
# new_value3 = f"{split_values[1][2:]}.{split_values[2]}"
# row[1] = new_value2
# row[2] = new_value3
# #检查条件:第一列不是数字,后面有一列中的值存在"%"并且"%"不是结尾,就进行拆分
# if not row[0].replace('.', '', 1).isdigit():
# for i in range(1, len(row) - 1):
# if row[i] == '' and '%' in row[i + 1] and len(row[i + 1].split('%')) == 2:
# split_values = row[i + 1].split('%')
# new_value1 = f"{split_values[0]}%"
# new_value2 = f"{split_values[1]}"
# row[i] = new_value1
# row[i + 1] = new_value2
# break
# #检查条件当一个列表中同时出现了2022年12月31日和2023年1月1日时【并且都只出现1次】在2022年12月31日后面增加“调整前”字段
# # if sum(1 for item in row if item.strip() == "2023年1月1日") == 1 and sum(1 for item in row if item.strip() == "2022年12月31日") == 1:
# # for i, item in enumerate(row):
# # stripped_item = item.strip() #去空格
# # if stripped_item == "2022年12月31日":
# # row[i] = stripped_item + '调整前'
# new_data = arr.tolist()#用于后面保存到数据库中
# rows, cols = arr.shape
# if rows == 1 and cols == 1:
# continue
# arr_str = ''.join([''.join(map(str, row)) for row in arr])
# #print(f'arr_str的值是 {arr_str}')
# #过滤掉不包含需抽取指标表格的文本
# matches = re.findall(STR_PATTERN, arr_str)
# pattern = re.findall(PATTERN,arr_str)
# muilt_pattern = re.findall(MUILT_PATTERN,arr_str)
# if len(matches) > 0 and len(pattern) == 0 and len(muilt_pattern)<5:
# if not tables_range.get(page_num):
# tables_range[page_num] = []
# tables_range[page_num].append({
# 'top' : top,
# 'buttom' : buttom,
# 'table_index' : table_index,
# 'page_num' : page_num,
# })
# print(f"tables_range的值是{tables_range}")
# #(f'file_id是{file_id}')
# # db_service.insert_pdf_parse_process({
# # 'file_id': file_id,
# # 'page_num' : page_num,
# # 'page_count' : 100,
# # 'type' : 'parse_table',
# # 'content':{
# # 'top' : top,
# # 'buttom' : buttom,
# # 'page_num' : page_num,
# # 'table_index' : table_index,
# # "type" : "table",
# # "data" : new_data,
# # 'sort_num' : page_num*1000 - top
# # }},conn,cursor)
# #get_text_content(file_path, file_id, tables_range, pages, conn, cursor)
# # cursor.close()
# # conn.close()
# end = time.time()
# print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
def get_table_range_test(file_path, file_id, pages, tables_range):
print('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
start = time.time()
# conn = mysql.connector.connect(
# host= MYSQL_HOST,
# user= MYSQL_USER,
# password= MYSQL_PASSWORD,
# database= MYSQL_DB
# )
# 创建一个cursor对象来执行SQL语句
#cursor = conn.cursor()
#redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'])
for t in tables:
top = t._bbox[3]
buttom = t._bbox[1]
page_num = int(t.page)
table_index = int(t.order)
arr = np.array(t.data)
#这里开始对可能解析错误的值做判断:
for i, row in enumerate(arr):
if len(row) >= 4:
# 检查条件:第一列不为数字,第二列和第四列为空,第三列有三个小数点【三列的数字被识别到一起了】
if (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 4 and len(row[2].rsplit('.', 1)[-1]) == 2) and (row[3] == ''):
split_values = row[2].split('.')
# 确保可以正确拆分成三个数值
if len(split_values) == 4:
new_value1 = f"{split_values[0]}.{split_values[1][:2]}"
new_value2 = f"{split_values[1][2:]}.{split_values[2][:2]}"
new_value3 = f"{split_values[2][2:]}.{split_values[3]}"
row[1] = new_value1
row[2] = new_value2
row[3] = new_value3
#检查条件:第一列不为数字,第二列第四列为空,第三列两个小数点,第五列两个小数点【两列的数字被识别到一起了】
if len(row) >= 5 and (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 3) and (row[3] == '') and (len(row[4].split('.')) == 3) and len(row[2].rsplit('.', 1)[-1]) == 2 and len(row[4].rsplit('.', 1)[-1]) == 2:
split_value_3 = row[2].split('.')
split_value_5 = row[4].split('.')
if len(split_value_3) == 3:
new_value2 = f"{split_value_3[0]}.{split_value_3[1][:2]}"
new_value3 = f"{split_value_3[1][2:]}.{split_value_3[2]}"
if len(split_value_5) == 3:
new_value4 = f"{split_value_5[0]}.{split_value_5[1][:2]}"
new_value5 = f"{split_value_5[1][2:]}.{split_value_5[2]}"
row[1] = new_value2
row[2] = new_value3
row[3] = new_value4
row[4] = new_value5
#检查条件:第一列不为数字,第二列为空,第三列有两个小数点,第四列为正常数字【两列的数字被识别到一起了】
if len(row) >= 4 and (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 3) and len(row[2].rsplit('.', 1)[-1]) == 2 and (row[3].replace('-', '', 1).replace('.', '', 1).isdigit()):
split_values = row[2].split('.')
if len(split_values) == 3:
new_value2 = f"{split_values[0]}.{split_values[1][:2]}"
new_value3 = f"{split_values[1][2:]}.{split_values[2]}"
row[1] = new_value2
row[2] = new_value3
#检查条件:第一列不是数字,后面有一列中的值存在"%"并且"%"不是结尾,就进行拆分
if not row[0].replace('.', '', 1).isdigit():
for i in range(1, len(row) - 1):
if row[i] == '' and '%' in row[i + 1] and len(row[i + 1].split('%')) == 2:
split_values = row[i + 1].split('%')
new_value1 = f"{split_values[0]}%"
new_value2 = f"{split_values[1]}"
row[i] = new_value1
row[i + 1] = new_value2
break
new_data = arr.tolist()#用于后面保存到数据库中
rows, cols = arr.shape
if rows == 1 and cols == 1:
continue
arr_str = ''.join([''.join(map(str, row)) for row in arr])
#过滤掉不包含需抽取指标表格的文本
matches = re.findall(STR_PATTERN, arr_str)
pattern = re.findall(PATTERN,arr_str)
muilt_pattern = re.findall(MUILT_PATTERN,arr_str)
if len(matches) > 0 and len(pattern) == 0 and len(muilt_pattern)<5:
if not tables_range.get(page_num):
tables_range[page_num] = []
tables_range[page_num].append({
'top' : top,
'buttom' : buttom,
'table_index' : table_index,
'page_num' : page_num,
})
print(f"tables_range的值是{tables_range}")
# db_service.insert_pdf_parse_process({
# 'file_id': file_id,
# 'page_num' : page_num,
# 'page_count' : 100,
# 'type' : 'parse_table',
# 'content':{
# 'top' : top,
# 'buttom' : buttom,
# 'page_num' : page_num,
# 'table_index' : table_index,
# "type" : "table",
# "data" : new_data,
# 'sort_num' : page_num*1000 - top
# }},conn,cursor)
# get_text_content(file_path, file_id, tables_range, pages, conn, cursor, redis_client)
# cursor.close()
# conn.close()
# redis_client.close()
end = time.time()
print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
get_table_range_test(file_path, file_id, pages, tables_range)

780
zzb_data_prod/utils.py Normal file
View File

@ -0,0 +1,780 @@
#coding=utf-8
import dashscope
from http import HTTPStatus
from pymilvus import MilvusClient
import json
from datetime import datetime
import re,os,time
import requests
import config
import numpy as np
from docx2pdf import convert
from config import api_key
import logging
logger = logging.getLogger(__name__)
dashscope.api_key = api_key
def get_md5(str):
import hashlib
m = hashlib.md5()
m.update(str.encode('utf-8'))
return m.hexdigest()
def embed_with_str(input):
retry = 0
max_retry = 5
t = 0.2
while retry < max_retry:
# Aliyun (DashScope) rate limiting: back off and retry on HTTP 429
resp = dashscope.TextEmbedding.call(
model=dashscope.TextEmbedding.Models.text_embedding_v2,
input=input)
if resp.status_code == HTTPStatus.OK:
return resp
elif resp.status_code == 429:
logger.info(f'Rate limited, waiting {t}s before retrying')
time.sleep(t)  # actually wait, as the log message promises
retry += 1
t += 0.1
else:
logger.error(f'Request failed, status code: {resp.status_code}')
return None
logger.error('Retry limit exceeded')
return None
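# Usage sketch (same access pattern as the callers in this repo):
#   resp = embed_with_str('2023年营业收入')
#   if resp is not None:
#       vec = resp.output["embeddings"][0]["embedding"]  # list of floats (1536-dim for text_embedding_v2)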
# If the bracketed content mentions 归属于/扣非, keep it but strip punctuation and Chinese numerals.
# If it contains a quarter keyword, replace the brackets with the quarter name.
# If it contains '±', replace the brackets with 同期增减.
# Otherwise, drop the bracketed content entirely.
def get_clean_text(text):
text = text.replace('流动资产:','').replace('半年度','上半年')
# normalize a few half-year report terms first so the vector match can recognize them
terms = ["货币资金", "应收账款",'应付账款']
# these terms do not take the 合计 suffix
terms_2 = ["固定资产","短期借款","合同负债","在建工程","商誉","存货"]
# terms whose word order would need swapping (kept for reference)
#terms_3 = ["固定资产","短期借款","合同负债","在建工程","商誉"]
# if any of the qualifier words below appears, return the text unchanged
terms_4 = ['年以内','年以上','年内','1-2年','2-3年','3-4年','4-5年','准备','在途','增值','评估','利息','应计','改良','跌价','补助','投资']
dates = [ "2021年12月31日","2022年12月31日","2022年1月1日","2023年1月1日", "2023年12月31日", "2022年6月30日","2023年6月30日","2024年6月30日","2024年半年度","2023年半年度","2022年半年度"]
#dates = [ "2021年12月31日","2022年12月31日","2023年12月31日","2022年1月1日","2023年1月1日", "2024年1月1日", "2022年6月30日","2023年6月30日","2024年6月30日","2021年初","2022年初","2023年初","2024年初",'2021年末','2022年末','2023年末','2024年末',"2023年","2022年","2021年"]
if any(term in text for term in terms_4):
return text
if len(text) <= 20:
for term in terms:
for date in dates:
if term in text and date in text:
text = f"{date}{term}合计"
return text
if len(text) <= 20:
for term in terms_2:
for date in dates:
if term in text and date in text:
text = f"{date}{term}"
return text
import re
replacement_dict = {
'加:': '',
'减:': '',
'%' : '',
'其中:': '',
'实际': '',
'/': '',
'重述后':'',
'年末金额':'年末',
'比重增减':'同比增减',
'比例':'同比',
}
#针对整个text做替换
def replace_all(text, replacements):
pattern = re.compile("|".join(map(re.escape, replacements.keys())))
return pattern.sub(lambda match: replacements[match.group(0)], text)
text = replace_all(text, replacement_dict)
#单独出现12月31日时就剔除掉
pattern_year = r'(?<!2025年|2024年|2023年|2022年|2021年)12月31日'
text = re.sub(pattern_year, '', text)
pattern = r"([^)]*)|\([^)]*\)" # match full-width brackets as well as half-width ones
matches = re.findall(pattern, text)
quarter_keywords = {
"1-3月": "第一季度",
"第1季度": "第一季度",
"4-6月": "第二季度",
"第2季度": "第二季度",
"7-9月": "第三季度",
"第3季度": "第三季度",
"10-12月": "第四季度",
"第4季度": "第四季度",
"调整后": "调整后",
"增减":"增减",
"": "",
"": "",
"": "",
"年内到期":"年内到期",
"16月":"",
"发行新股":"发行新股",
}
#针对text的括号内容进行识别判断
for match in matches:
month_keywords_found = re.search(r"归属于|扣非", match)
if not month_keywords_found: # 改为不包含时的处理
replaced = False
for keyword, replacement in quarter_keywords.items():
if re.search(keyword, match):
text = re.sub(re.escape(match), replacement, text) #触发关键词替换
replaced = True
break
if not replaced:
text = re.sub(re.escape(match), "", text) # 如果没有找到匹配的关键词,直接删除
else:# 如果包含特殊关键词,删除整个括号内容
text = re.sub(r"[^\w\s]", "", text)
return text
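# Illustrative check (not part of the original commit), given the bracket pattern
# and keyword table above: bracketed period hints are normalized to quarter names,
# and brackets without a known keyword are dropped.
def _demo_get_clean_text():
    assert get_clean_text('营业收入(1-3月)') == '营业收入第一季度'
    assert get_clean_text('净利润(测试)') == '净利润'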
def convert_docx_to_pdf(file_path):
# 检查文件是否为 .docx 格式
if file_path.lower().endswith('.docx'):
# 生成 PDF 文件路径
pdf_path = os.path.splitext(file_path)[0] + '.pdf'
try:
# 执行转换
convert(file_path, pdf_path)
logger.info(f"转换成功: {pdf_path}")
except Exception as e:
logger.error(f"转换失败: {e}")
else:
logger.error("错误: 文件必须是 .docx 格式。")
def save_pdf_from_url(url, file_path):
from urllib.parse import unquote
# 发起 GET 请求并保存文件
response = requests.get(url)
local_file_path = ''
url = unquote(url)
# 检查响应状态码
if response.status_code == 200:
# 文件下载成功
url_without_params = url.split('?')[0]
# 从处理后的URL中提取文件名
# 提取文件名
file_name = url_without_params.split('/')[-1]
#https://financial-report-test.obs.cn-east-3.myhuaweicloud.com:443/upload/file/909f3dd3337a4dd4bc24fb4748c6c76e.PDF?AccessKeyId=IIDIMIUZ1UBBVPKIVB4W&Expires=1726798358&Signature=fKgrDPjmd99Nje4wwvBJxmFlXZY%3D
# 指定本地文件保存路径
local_file_path = file_path + file_name
# local_file_path = convert_docx_to_pdf(local_file_path)
with open(local_file_path, 'wb') as file:
file.write(response.content)
logger.info(f"文件已下载到 {local_file_path}")
else:
# 文件下载失败
logger.error(f"无法下载文件,状态码:{response.status_code}")
return local_file_path
def get_range(count,parts_num):
# 获取 CPU 核数
if count < parts_num:
parts_num = count
# 使用 divmod() 函数计算除法结果和余数
quotient, remainder = divmod(count, parts_num)
count_range_parts = []
for i in range(parts_num):
start_num = i * quotient
if i < parts_num-1:
start_num = i * quotient
end_num = start_num+quotient
else:
end_num = count
count_range_parts.append(f'{start_num}-{end_num}')
return count_range_parts
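# Illustrative check (not part of the original commit): get_range splits `count`
# records into `parts_num` contiguous 'start-end' ranges, with the final range
# absorbing the remainder so every index is covered exactly once.
def _demo_get_range():
    assert get_range(10, 4) == ['0-2', '2-4', '4-6', '6-10']
    assert get_range(2, 8) == ['0-1', '1-2']  # parts_num is clamped to count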
def cosine_similarity(vector_a, vector_b):
# 将向量转换为 NumPy 数组
vector_a = np.array(vector_a)
vector_b = np.array(vector_b)
# 计算两个向量的点积
dot_product = np.dot(vector_a, vector_b)
# 计算两个向量的欧几里得范数
norm_a = np.linalg.norm(vector_a)
norm_b = np.linalg.norm(vector_b)
# 计算余弦相似度
cosine_sim = dot_product / (norm_a * norm_b)
return cosine_sim
def get_period_type(text, year):
l_year = f'{int(year)-1}'
bl_year = f'{int(year)-2}'
c_period = f'当期|本期|本报告期|报告期|本年|本期|{year}'
l_period = f'上年|上期|上年度|{l_year}'
bl_period = f'前年|{bl_year}'
if len(re.findall(c_period, text)) > 0:
return 'c'
elif len(re.findall(l_period, text)) > 0:
return 'l'
elif len(re.findall(bl_period, text)) > 0:
return 'bl'
else:
return 'c'
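# Illustrative check (not part of the original commit): classification keys off
# period keywords first, then the literal year strings; with no cue it defaults
# to the current period.
def _demo_get_period_type():
    assert get_period_type('本期营业收入', '2023') == 'c'
    assert get_period_type('2022年营业收入', '2023') == 'l'
    assert get_period_type('营业收入', '2023') == 'c'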
def get_period_type_other(text, year):
l_year = f'{int(year)-1}'
bl_year = f'{int(year)-2}'
c_period = f'当期|本期|本报告期|报告期|本年|本期|{year}'
l_period = f'上年|上期|上年度|{l_year}'
bl_period = f'前年|{bl_year}'
if len(re.findall(c_period, text)) > 0:
return 'c'
elif len(re.findall(l_period, text)) > 0:
return 'l'
elif len(re.findall(bl_period, text)) > 0:
return 'bl'
else:
return 'c_n'
def get_start_period_type(text):
s_period = '期初|1月1日|年初'
if len(re.findall(s_period, text)) > 0:
return '1'
else:
return '0'
def get_season_flag(text):
season_period = '第1季度|第2季度|第3季度|第4季度|一季度|二季度|三季度|四季度|1-3月|4-6月|7-9月|10-12月'
if len(re.findall(season_period, text)) > 0:
return '1'
else:
return '0'
def get_percent_flag(text):
percent_word = '收益率|占比|比重|比例|同比增减|同比上升|同比下降|变化幅度|同期增减|本年比上年增减|同比变动|本期期末金额较上期期末变动比例'
if len(re.findall(percent_word, text)) > 0:
return '1'
else:
return '0'
def get_kf_flag(text):
kf_word = '扣非|扣除非经常性损益'
if len(re.findall(kf_word, text)) > 0:
return '1'
else:
return '0'
def get_report_start(text):
kf_word = '报告期初|1月1日'
if len(re.findall(kf_word, text)) > 0:
return '1'
else:
return '0'
def get_percent_growth(text):
percent_growth_word = '变动|本年比上年|比例同比增减|比例同比上升|比例同比下降|比例变化幅度|比例变动比例|比例本期比上年同期增减|比例本年比上年增减|比例同比变动|比例本期期末金额较上期期末变动比例|比率同比增减|比率同比上升|比率同比下降|比率变化幅度|比率变动比例|比率本期比上年同期增减|比率本年比上年增减|比率同比变动|比率本期期末金额较上期期末变动比例|占比同比增减|占比同比上升|占比同比下降|占比变化幅度|占比变动比例|占比本期比上年同期增减|占比本年比上年增减|占比同比变动|占比本期期末金额较上期期末变动比例|费用同比增减|费用同比上升|费用同比下降|费用变化幅度|费用变动比例|费用本期比上年同期增减|费用本年比上年增减|费用同比变动|费用本期期末金额较上期期末变动比例'
if len(re.findall(percent_growth_word, text)) > 0:
return '1'
else:
return '0'
def check_black_list(meta_measure, pdf_measure, black_array):
# 获取黑名单数据
#black_array = fetch_black_list_data(cursor)
for black in black_array:
black_meta = black.split(':')[0]
black_pdfs = black.split(':')[1].split(',')
if meta_measure==black_meta:
for pdf in black_pdfs:
if pdf_measure.find(pdf) >= 0:
return True
return False
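# Illustrative check (not part of the original commit): each blacklist entry is
# '<meta measure>:<comma-separated terms>'; a hit means the candidate pdf
# measure is rejected for that meta measure.
def _demo_check_black_list():
    rules = ['营业收入:营业外收入,主营业务']
    assert check_black_list('营业收入', '营业外收入合计', rules) is True
    assert check_black_list('营业收入', '营业总收入', rules) is False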
def check_black_list_old(meta_measure,pdf_measure):
# 判断指标名是否包含黑名单词
black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额','营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司','归母净利润:净资产,净利率,扣除,年度公司','扣非净利润:净资产,净利率,年度公司','经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计','投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计','非经常性损益:扣除非经常性损益','基本每股收益:稀释每股收益','稀释每股收益:基本每股收益','总资产:净资产','应收账款:应付账款','短期借款:长期借款','应付账款:应收账款','长期借款:短期借款','研发投入:比例,比率,占比,费用','资本化研发投入:比例,比率,占比,费用','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用']
# current_period = f'当期:{report_year}年1-6月'
# black_array.append(current_period)
for black in black_array:
black_meta = black.split(':')[0]
black_pdfs = black.split(':')[1].split(',')
if meta_measure.find(black_meta) >= 0:
for pdf in black_pdfs:
if pdf_measure.find(pdf) >= 0:
return True
return False
def check_white_list(meta_measure,pdf_measure):
white_array = ['基本每股收益:每股收益','加权平均净资产收益率同比变动:比','季度变动比例:比']
for black in white_array:
black_meta = black.split(':')[0]
black_pdfs = black.split(':')[1].split(',')
if meta_measure.find(black_meta) >= 0:
for pdf in black_pdfs:
if pdf_measure.find(pdf) < 0:
return True
return False
def check_title_black_list(meta_measure,text_info):
# 判断指标名是否包含黑名单词
black_array = ['营业收入:前五名,前5名,合计','营业成本:合计','财务费用:现金流','销售费用:现金流','管理费用:现金流','研发费用:现金流','非经常性损益:合计']
for black in black_array:
black_meta = black.split(':')[0]
black_pdfs = black.split(':')[1].split(',')
if meta_measure.find(black_meta) >= 0:
for pdf in black_pdfs:
if text_info.find(pdf) >= 0:
return True
return False
# Flag text whose alphabetic share (CJK characters count as alphabetic) is at or below the threshold, i.e. mostly digits and symbols
def under_non_alpha_ratio(text: str, threshold: float = 0.6):
if len(text) == 0:
return False
alpha_count = len([char for char in text if char.strip() and char.isalpha()])
total_count = len([char for char in text if char.strip()])
try:
ratio = alpha_count / total_count
return ratio <= threshold
except:
return False
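# Illustrative check (not part of the original commit): CJK characters count as
# alphabetic for str.isalpha, so prose passes while number-heavy cells trip the
# threshold.
def _demo_under_non_alpha_ratio():
    assert under_non_alpha_ratio('1234.56 78%') is True
    assert under_non_alpha_ratio('归属于母公司股东的净利润') is False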
def check_table_title_black_list(text,table_title_black_list):#report_year
#previous_year = int(report_year) - 1
if table_title_black_list is None:
return False
if len(re.findall(table_title_black_list, text)) > 0:
return True
if re.search(r'上年度\s*$', text):
return True
return False
#通过关键词黑名单匹配表格上方的文本区域,提取需要过滤的表格
def check_table_title_black_list_old(text,report_year):#report_year
previous_year = int(report_year) - 1
table_title_black_list = f"""所有权或使用权受到限制的资产|持有待售资产|关联交易|未确认递延所得税资产明细|{previous_year}年度|{previous_year}年1-6月|自{previous_year}年1月1日至6月30日止期间|流动性风险|关联交易|账龄超过|流动风险|公司资产负债表|按账龄组合|线上直营|线上直销|公司现金流量表|公司利润表|应收账款|在建工程|固定资产|其他与筹资活动有关的现金|汇率风险|市场风险|主营业务收入|主营收入|其他收入|前五名|前5名|经营活动有关的现金|股份变动对最近一年和最近一期每股收益、每股净资产等财务指标的影响|合同产生的收入情况|子公司|参股公司|控股公司|分解信息|经营活动产生的现金|行业分类|产品分类|地区分类|业绩快报|销售渠道|调整情况说明|合同分类|计入当期损益的政府补助|股份变动对最近一年和最近一期|分部的财务信息|显示服务创收|线上销售情况|试运行销售|会计政策变更|品牌经营业务|工程施工业务|开发业务|制造业务|合营安排或联营企业中的权益|联营企业的主要财务信息|汇率及通货膨胀|与金融工具相关的风险|运营业务|B端业务|终止经营现金流量|终止经营|公司股份总数及股东结构变动及公司资产和负债结构的变动情况|母公司|现金流量表补充|直营店店效情况|担保人2023年度未经审计的|外汇风险|公司各业务板块经营情况|报告期确认的包括在合同负债期初账面价值中的收入|资产受限情况|资产权利受限情况|内控自我评价报告|所有权或使用权受限资产|合并日被合并方资产、负债的账面价值|经营租赁资产|前5|前五|②|不属于现金及现金等价物的货币资金|按销售模式分|按产品类别分|按照销售区域|产品类别|销售模式|经销模式|关键管理人员|截至{previous_year}年6月30日止六个月期间|关联方提供的存款及贷款服务|报告期内各销售渠道的盈利情况|报告期内各地区的盈利情况|报告期内各产品的盈利情况|其他非流动负债|关联方提供的存款及贷款服务|自营销售分商品类别数据|组合计提|考核指标|不属于现金及现金等价物的货币资金|应收款项融资|本期计提、收回或转回的坏账准备情况|存货跌价准备|持有待售负债"""
if len(re.findall(table_title_black_list, text)) > 0:
return True
if re.search(r'上年度\s*$', text):
return True
return False
#通过关键词黑名单匹配页面下方的文本区域,提取需要过滤的表格
def check_table_title_black_list_button(text,table_title_black_list):
if table_title_black_list is None:
return False
if len(re.findall(table_title_black_list, text)) > 0:
return True
if re.search(r'上年度\s*$', text):
return True
return False
def check_table_title_black_list_button_old(text):
table_title_black_list = """公司资产负债表|公司现金流量表|公司利润表|主营业务收入|主营收入|其他收入|前五名|前5名|经营活动有关的现金|股份变动对最近一年和最近一期每股收益、每股净资产等财务指标的影响|合同产生的收入情况|子公司|参股公司|控股公司|分解信息|经营活动产生的现金|2022年度|行业分类|产品分类|地区分类|业绩快报|销售渠道|调整情况说明|合同分类|计入当期损益政府补助|股份变动对最近一年和最近一期|分部的财务信息|显示服务创收|线上销售情况|试运行销售|品牌经营业务|工程施工业务|开发业务|制造业务|合营安排或联营企业中的权益|联营企业的主要财务信息|汇率及通货膨胀|与金融工具相关的风险|运营业务|B端业务|终止经营现金流量|终止经营|公司股份总数及股东结构变动及公司资产和负债结构的变动情况|不属于现金及现金等价物的货币资金|经营租赁资产|分地区|分产品|分行业|使用权受限资产|资产受限情况|经销模式|持续的第三层次公允价值计量项目,期初与期末账面价值间的调节信息及不可观察参数敏感|权利受限情况|应收款项融资|本期计提、收回或转回的坏账准备情况"""
if len(re.findall(table_title_black_list, text)) > 0:
return True
if re.search(r'上年度\s*$', text):
return True
return False
def check_table_title_black_list_measure(text):
#black_array = ['补充资料:研发费用,管理费用,财务费用'
# ,'营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司,合计,汇总'
#]
table_title_black_list = """补充资料|测试文本|其他非流动负债|应收款项融资|本期计提、收回或转回的坏账准备情况|筹资活动产生的各项负债变动情况|持有待售资产|账龄超过 1 年或逾期的重要应付账款|经营租赁资产|计息金融工具|坏账准备"""
if len(re.findall(table_title_black_list, text)) > 0:
return True
return False
#过滤原始指标中包含黑名单
def check_pdf_measure_black_list(text):
pdf_measure_black_list = '股权变动前|股权变动后|含股份支付|境内|境外|调整前|有限公司|责任公司|其他|变更前|差异|同口径|调整金额'
if len(re.findall(pdf_measure_black_list, text)) > 0:
return True
if "其中:营业收入" in text:
return False
if "同比" in text and "" in text:
#if text.find("同比") < text.find("额"):
if text.endswith(""):
return True
return False
def check_pdf_measure(pdf_measure):
keywords_1 = [
'2022年', '2023年', '2021年', '第一季度', '第二季度', '第三季度', '第四季度', '增减', '变动', '本期','同期', '当期', '报告期', '前年',
'上年', '上期', '本年', '1-3月', '4-6月', '7-9月', '10-12月'
]
keywords_2 = ['这里是一个测试文本']
contain_keyword_1 = any(keyword in pdf_measure for keyword in keywords_1)
contain_keyword_2 = any(keyword in pdf_measure for keyword in keywords_2)
#只有 未出现周期,同时出现了'调整后'才会删掉指标
if not contain_keyword_1 and contain_keyword_2:
return True
return False
# def check_white_list(meta_measure,pdf_measure):
# # 判断指标名是否包含白名单词
# black_array = ['营业收入:营业外收入,主营业务,营业总收入,扣除','归母净利润:净资产,净利率,扣除','扣非净利润:净资产,净利率','经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计','投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计','非经常性损益:扣除非经常性损益','基本每股收益:稀释每股收益','稀释每股收益:基本每股收益','总资产:净资产','应收账款:应付账款','短期借款:长期借款','应付账款:应收账款','长期借款:短期借款','研发投入:比例,比率,占比,费用','资本化研发投入:比例,比率,占比,费用','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用']
# for black in black_array:
# black_meta = black.split(':')[0]
# black_pdfs = black.split(':')[1].split(',')
# if meta_measure.find(black_meta) >= 0:
# for pdf in black_pdfs:
# if pdf_measure.find(pdf) >= 0:
# return True
# return False
def check_line_text(line_text):
if line_text == 'PAGE':
return False
if line_text == '(续)':
return False
if line_text.endswith('(续)'):
return False
if line_text.endswith("年度财务报表") and "有限公司" in line_text:
return False
if len(line_text) < 20 and line_text.endswith("有限公司"):
return False
substrings = [
'对内加快发展方式绿色转型、对外形成绿色生产和生活方式',
'可持续发展、创新发展“8”是八大绿色行动',
'色新赋能、催生绿色新科技、筑牢绿色新支撑',
'接上表','续上表',
]
for substring in substrings:
if substring in line_text:
return False
return True
def pdf_text_flag(text : str):
if under_non_alpha_ratio(text) and len(text) < 25:
return True
if len(text) < 5:
return True
if not re.findall(',|,|。|、|(|)|:|:|;|;',text):
return True
if text.find('适用') != -1 and text.find('不适用') != -1:
return True
if text.find('√') != -1 and text.find('□') != -1:
return True
return False
def get_change_rate_flag(text):
percent_word = '同比增减|同比上升|同比下降|变化幅度|变动比例|本期比上年同期增减|本年比上年增减|同比变动|本期期末金额较上期期末变动比例'
if len(re.findall(percent_word, text)) > 0:
return '1'
else:
return '0'
def check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,conn_app,cursor_app):
content_value = f"{table_num}_{table_index}"
measure_index_array = []
select_measure_index_query = '''
SELECT DISTINCT text FROM measure_parser_info_linetext WHERE file_id = %s AND type = 'measure_index' and content = %s
'''
cursor_app.execute(select_measure_index_query, (file_id,content_value,))
measure_index_records = cursor_app.fetchall()
for measure_index_record in measure_index_records:
measure_index_array.append(measure_index_record[0])
black_array = ['补充资料:研发费用,管理费用,财务费用,销售费用'
,'测试标题:测试指标'
,'其他非流动负债:合同负债'
,'应收款项融资:应收账款'
,'本期计提、收回或转回的坏账准备情况:应收账款'
,'筹资活动产生的各项负债变动情况:短期借款,长期借款'
,'持有待售资产:固定资产'
,'账龄超过 1 年或逾期的重要应付账款:应付账款'
,'经营租赁资产:固定资产'
,'计息金融工具:货币资金,短期借款,交易性金融资产'
,'坏账准备:应收账款'
]
for black in black_array:
black_meta = black.split(':')[0]
black_pdfs = black.split(':')[1].split(',')
#if measure_index_array.find(black_meta) >= 0:
#if black_meta in measure_index_array:
if any(black_meta in measure_index for measure_index in measure_index_array):
if any(pdf in pdf_measure for pdf in black_pdfs):
#for pdf in black_pdfs:
#if pdf in pdf_measure:
#if pdf_measure.find(pdf) >= 0:
return True
return False
def check_black_table_list(data):
black_array = ['补充资料:研发费用,管理费用,财务费用,销售费用',
#'补充目录:母公司'
]
for black in black_array:
black_meta = black.split(':')[0]
black_pdfs = black.split(':')[1].split(',')
if any(black_meta in cell for row in data for cell in row):
logger.debug(data)
for pdf in black_pdfs:
data = [row for row in data if not any(pdf in cell for cell in row)]
return data
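# Illustrative check (not part of the original commit): when a marker cell such
# as 补充资料 appears anywhere in the table, rows containing the associated terms
# are stripped before further parsing.
def _demo_check_black_table_list():
    data = [['补充资料', ''], ['研发费用', '100'], ['营业收入', '200']]
    assert check_black_table_list(data) == [['补充资料', ''], ['营业收入', '200']]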
if __name__ == '__main__':
logger.debug(len('我是我'))
# logger.debug(under_non_alpha_ratio('202水电费水电费水电费是的205月'))
# title = '母公司财务报表主要项目注释'
# if len(re.findall('母公司|现金流量表补充', title)) >0 and len(re.findall('项目注释', title)) == 0:
# logger.debug('1')
# else:
# logger.debug('0')
# logger.debug(check_black_list('当期投资活动现金流净额','当前筹资活动现金流净额'))
# test = '2023年1-12月'
# logger.debug(get_period_type('上年度本期费用化研发投入'))
# logger.debug(get_period_type('费用化研发投入本年度'))
# vector_a = embed_with_str('第一季度营业收入')
# vector = vector_a.output["embeddings"][0]["embedding"]
# vector_b = embed_with_str('营业收入第一季度')
# vector1 = vector_b.output["embeddings"][0]["embedding"]
# similarity = cosine_similarity(vector, vector1)
# logger.debug(f"余弦相似度: {similarity}")
# measure_data = [
# '1,1,营业收入2023年金额,1003535799.51',
# '1,1,营业收入2022年金额,869401513.71',
# '1,1,营业收入变动比例,15.43%',
# '1,1,营业成本2023年金额,810779075.89',
# '1,1,营业成本2023年占营业收入的比重,80.79%',
# '1,1,营业成本2022年金额,702990363.57',
# '1,1,营业成本2022年占营业收入的比重,80.86%',
# '1,1,营业成本变动比例,15.33%',
# '1,1,毛利率2023年金额,19.21%',
# '1,1,毛利率2022年金额,19.14%',
# '1,1,销售费用2023年金额,34065464.60',
# '1,1,销售费用2023年占营业收入的比重,3.39%',
# '1,1,销售费用2022年金额,28038106.19',
# '1,1,销售费用2022年占营业收入的比重,3.22%',
# '1,1,销售费用变动比例,21.50%',
# '1,1,管理费用2023年金额,50807308.69',
# '1,1,管理费用2023年占营业收入的比重,5.06%',
# '1,1,管理费用2022年金额,38251704.48',
# '1,1,管理费用2022年占营业收入的比重,4.40%',
# '1,1,管理费用变动比例,32.82%',
# '1,1,研发费用2023年金额,35312198.23',
# '1,1,研发费用2023年占营业收入的比重,3.52%',
# '1,1,研发费用2022年金额,30081787.99',
# '1,1,研发费用2022年占营业收入的比重,3.46%',
# '1,1,研发费用变动比例,17.39%',
# '1,1,财务费用2023年金额,8015604.52',
# '1,1,财务费用2023年占营业收入的比重,0.80%',
# '1,1,财务费用2022年金额,5739677.85',
# '1,1,财务费用2022年占营业收入的比重,0.66%',
# '1,1,财务费用变动比例,39.65%',
# '1,1,信用减值损失2023年金额,-11873626.82',
# '1,1,信用减值损失2023年占营业收入的比重,-1.18%',
# '1,1,信用减值损失2022年金额,-8903293.61',
# '1,1,信用减值损失2022年占营业收入的比重,-1.02%',
# '1,1,信用减值损失变动比例,33.36%',
# '1,1,资产减值损失2023年金额,-2328729.46',
# '1,1,资产减值损失2023年占营业收入的比重,-0.23%',
# '1,1,资产减值损失2022年金额,-2285987.53',
# '1,1,资产减值损失2022年占营业收入的比重,-0.26%',
# '1,1,资产减值损失变动比例,1.87%',
# '1,1,其他收益2023年金额,17886048.88',
# '1,1,其他收益2023年占营业收入的比重,1.78%',
# '1,1,其他收益2022年金额,11025908.32',
# '1,1,其他收益2022年占营业收入的比重,1.27%',
# '1,1,其他收益变动比例,62.22%',
# '1,1,投资收益2023年金额,323361.47',
# '1,1,投资收益2023年占营业收入的比重,0.03%',
# '1,1,投资收益2022年金额,1119730.43',
# '1,1,投资收益2022年占营业收入的比重,0.13%',
# '1,1,投资收益变动比例,-71.12%',
# '1,1,公允价值变动收益2023年占营业收入的比重,0.00%',
# '1,1,公允价值变动收益2022年金额,10183.62',
# '1,1,公允价值变动收益2022年占营业收入的比重,0.00%',
# '1,1,公允价值变动收益变动比例,-100.00%',
# '1,1,资产处置收益2023年金额,12782544.48',
# '1,1,资产处置收益2023年占营业收入的比重,1.27%',
# '1,1,资产处置收益2022年金额,-59.56',
# '1,1,资产处置收益2022年占营业收入的比重,0.00%',
# '1,1,资产处置收益变动比例,21461726.06%',
# '1,1,汇兑收益2023年金额,0',
# '1,1,汇兑收益2023年占营业收入的比重,0%',
# '1,1,汇兑收益2022年金额,0',
# '1,1,汇兑收益2022年占营业收入的比重,0%',
# '1,1,汇兑收益变动比例,0%',
# '1,1,营业利润2023年金额,76175407.00',
# '1,1,营业利润2023年占营业收入的比重,7.59%',
# '1,1,营业利润2022年金额,63332601.81',
# '1,1,营业利润2022年占营业收入的比重,7.28%',
# '1,1,营业利润变动比例,20.28%',
# '1,1,营业外收入2023年金额,5788307.99',
# '1,1,营业外收入2023年占营业收入的比重,0.58%',
# '1,1,营业外收入2022年金额,1083997.19',
# '1,1,营业外收入2022年占营业收入的比重,0.12%',
# '1,1,营业外收入变动比例,433.98%',
# '1,1,营业外支出2023年金额,687271.68',
# '1,1,营业外支出2023年占营业收入的比重,0.07%',
# '1,1,营业外支出2022年金额,1554243.54',
# '1,1,营业外支出2022年占营业收入的比重,0.18%',
# '1,1,营业外支出变动比例,-55.78%',
# '1,1,净利润2023年金额,72975283.09',
# '1,1,净利润2023年占营业收入的比重,7.27%',
# '1,1,净利润2022年金额,57747603.98',
# '1,1,净利润2022年占营业收入的比重,6.64%',
# '1,1,净利润变动比例,26.37%',
# '1,1,税金及附加2023年金额,5170339.13',
# '1,1,税金及附加2023年占营业收入的比重,0.52%',
# '1,1,税金及附加2022年金额,1933753.49',
# '1,1,税金及附加2022年占营业收入的比重,0.22%',
# '1,1,税金及附加变动比例,167.37%',
# '1,1,所得税费用2023年金额,8301160.22',
# '1,1,所得税费用2023年占营业收入的比重,0.83%',
# '1,1,所得税费用2022年金额,5114751.48',
# '1,1,所得税费用2022年占营业收入的比重,0.59%',
# '1,1,所得税费用变动比例,62.30%',
# '1,1,少数股东损益2023年金额,-58350.22',
# '1,1,少数股东损益2023年占营业收入的比重,-0.01%',
# '1,1,少数股东损益2022年金额,-946.60',
# '1,1,少数股东损益2022年占营业收入的比重,0.00%',
# '1,1,少数股东损益变动比例,-6064.19%',
# '1,1,归属于母公司所有者的净利润2023年金额,73033633.31',
# '1,1,归属于母公司所有者的净利润2023年占营业收入的比重,7.28%',
# '1,1,归属于母公司所有者的净利润2022年金额,57748550.58',
# '1,1,归属于母公司所有者的净利润2022年占营业收入的比重,6.64%',
# '1,1,归属于母公司所有者的净利润变动比例,26.47%',
# '1,1,归属于少数股东的综合收益总额2023年金额,-58350.22',
# '1,1,归属于少数股东的综合收益总额2023年占营业收入的比重,-0.01%',
# '1,1,归属于少数股东的综合收益总额2022年金额,-946.60',
# '1,1,归属于少数股东的综合收益总额2022年占营业收入的比重,0.00%',
# '1,1,归属于少数股东的综合收益总额变动比例,-6064.19%',
# '1,1,归属于母公司所有者的综合收益总额2023年金额,73033633.31',
# '1,1,归属于母公司所有者的综合收益总额2023年占营业收入的比重,7.28%',
# '1,1,归属于母公司所有者的综合收益总额2022年金额,57748550.58',
# '1,1,归属于母公司所有者的综合收益总额2022年占营业收入的比重,6.64%',
# '1,1,归属于母公司所有者的综合收益总额变动比例,26.47%',
# '2,1,主营业务收入2023年,983698831.48',
# '2,1,主营业务收入2022年,854682261.31',
# '2,1,主营业务收入变动比例,15.10%',
# '2,1,其他业务收入2023年,19836968.03',
# '2,1,其他业务收入2022年,14719252.40',
# '2,1,其他业务收入变动比例,34.77%',
# '2,1,主营业务成本2023年,793604607.43',
# '2,1,主营业务成本2022年,690932741.27',
# '2,1,主营业务成本变动比例,14.86%',
# '2,1,其他业务成本2023年,17174468.46',
# '2,1,其他业务成本2022年,12057622.30',
# '2,1,其他业务成本变动比例,42.44%',
# '3,1,变压器营业收入,490028234.05',
# '3,1,变压器营业成本,402179824.08',
# '3,1,变压器毛利率,17.93%',
# '3,1,变压器营业收入比上年同期增减,16.22%',
# '3,1,变压器营业成本比上年同期增减,16.33%',
# '3,1,变压器毛利率比上年同期增减,减少0.07个百分点',
# '3,1,高低压成套开关设备营业收入,261342442.26',
# '3,1,高低压成套开关设备营业成本,206645237.99',
# '3,1,高低压成套开关设备毛利率,20.93%',
# '3,1,高低压成套开关设备营业收入比上年同期增减,-8.93%',
# '3,1,高低压成套开关设备营业成本比上年同期增减,-9.91%',
# '3,1,高低压成套开关设备毛利率比上年同期增减,增加0.86个百分点',
# '3,1,户外成套设备营业收入,198013248.27',
# '3,1,户外成套设备营业成本,157856817.84',
# '3,1,户外成套设备毛利率,20.28%',
# '3,1,户外成套设备营业收入比上年同期增减,62.25%',
# '3,1,户外成套设备营业成本比上年同期增减,65.30%',
# '3,1,户外成套设备毛利率比上年同期增减,减少1.47个百分点',
# '3,1,其他营业收入,54151874.93',
# '3,1,其他营业成本,44097195.98',
# '3,1,其他毛利率,18.57%',
# '3,1,其他营业收入比上年同期增减,39.68%',
# '3,1,其他营业成本比上年同期增减,36.10%',
# '3,1,其他毛利率比上年同期增减,增加2.14个百分点',
# '3,1,合计营业收入,1003535799.51',
# '3,1,合计营业成本,810779075.89',
# '3,2,东北地区营业收入,2425280.53',
# '3,2,东北地区营业成本,1427939.37',
# '3,2,东北地区毛利率,41.12%',
# '3,2,东北地区营业收入比上年同期增减,-69.51%',
# '3,2,东北地区营业成本比上年同期增减,-77.58%',
# '3,2,东北地区毛利率比上年同期增减,增加21.20个百分点',
# '3,2,华北地区营业收入,70542020.62',
# '3,2,华北地区营业成本,53044055.18',
# '3,2,华北地区毛利率,24.81%',
# '3,2,华北地区营业收入比上年同期增减,205.32%',
# '3,2,华北地区营业成本比上年同期增减,203.18%',
# '3,2,华北地区毛利率比上年同期增减,增加0.54个百分点',
# '3,2,华东地区营业收入,770352353.33',
# '3,2,华东地区营业成本,636803535.34',
# '3,2,华东地区毛利率,17.34%',
# '3,2,华东地区营业收入比上年同期增减,24.17%',
# '3,2,华东地区营业成本比上年同期增减,25.30%',
# '3,2,华东地区毛利率比上年同期增减,减少0.74个百分点',
# '3,2,华南地区营业收入,18509519.71',
# '3,2,华南地区营业成本,14496855.46',
# '3,2,华南地区毛利率,21.68%',
# '3,2,华南地区营业收入比上年同期增减,-57.08%',
# '3,2,华南地区营业成本比上年同期增减,-57.98%',
# '3,2,华南地区毛利率比上年同期增减,增加1.67个百分点',
# '3,2,华中地区营业收入,60588394.64',
# '3,2,华中地区营业成本,44559969.21',
# '3,2,华中地区毛利率,26.45%',
# '3,2,华中地区营业收入比上年同期增减,-51.24%',
# '3,2,华中地区营业成本比上年同期增减,-55.13%',
# '3,2,华中地区毛利率比上年同期增减,增加6.38个百分点',
# '3,2,西北地区营业收入,58618014.32',
# '3,2,西北地区营业成本,42844719.81',
# '3,2,西北地区毛利率,26.91%',
# '3,2,西北地区营业收入比上年同期增减,178.59%',
# '3,2,西北地区营业成本比上年同期增减,173.62%',
# '3,2,西北地区毛利率比上年同期增减,增加1.33个百分点',
# '3,2,西南地区营业收入,22500216.36',
# '3,2,西南地区营业成本,17602001.52',
# '3,2,西南地区毛利率,21.77%',
# '3,2,西南地区营业收入比上年同期增减,-23.74%',
# '3,2,西南地区营业成本比上年同期增减,-17.89%',
# '3,2,西南地区毛利率比上年同期增减,减少5.57个百分点',
# '3,2,合计营业收入,1003535799.51',
# '3,2,合计营业成本,810779075.89',
# '5,2,经营活动产生的现金流量净额2023年,-44713443.44',
# '5,2,经营活动产生的现金流量净额2022年,-53241071.45',
# '5,2,经营活动产生的现金流量净额变动比例,16.02%',
# '5,2,投资活动产生的现金流量净额2023年,-88649920.50',
# '5,2,投资活动产生的现金流量净额2022年,-94251741.15',
# '5,2,投资活动产生的现金流量净额变动比例,5.94%',
# '5,2,筹资活动产生的现金流量净额2023年,96607197.26',
# '5,2,筹资活动产生的现金流量净额2022年,210537586.22',
# '5,2,筹资活动产生的现金流量净额变动比例,-54.11%'
# ]
# client = MilvusClient(
# uri="http://localhost:19530"
# )
# vector_obj = embed_with_str('2023年营业收入')
# vector = vector_obj.output["embeddings"][0]["embedding"]
# vector_b = embed_with_str('营业收入第一季度')
# vector1 = vector_b.output["embeddings"][0]["embedding"]
# similarity = cosine_similarity(vector, vector1)
# logger.debug(f"余弦相似度: {similarity}")
# insert_measure_data(client, measure_data)
# text = '营业收入第一季度1-3月份'
# new_text = re.sub(r'[^)]*', '',text)
# logger.debug(new_text)

3
zzb_data_prod/wget-log Normal file
View File

@ -0,0 +1,3 @@
--2024-12-27 11:22:17-- https://financial-report.obs.cn-east-3.myhuaweicloud.com/upload/file/44b374ac0fe140a2922c360db47335a1.PDF?AccessKeyId=WMBIZTLULUR24OBUIRC4
Resolving financial-report.obs.cn-east-3.myhuaweicloud.com (financial-report.obs.cn-east-3.myhuaweicloud.com)... failed: Name or service not known.
wget: unable to resolve host address financial-report.obs.cn-east-3.myhuaweicloud.com

30
zzb_data_word/Mil_unit.py Normal file
View File

@ -0,0 +1,30 @@
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection,MilvusClient
from config import MILVUS_CLIENT
import time
from datetime import datetime, timedelta
def create_partition_by_hour(current_hour):
# 连接到 Milvus 服务器
connections.connect("default",uri=MILVUS_CLIENT)
# 获取集合
collection_name = "pdf_measure_v4"
collection = Collection(collection_name)
# 创建当前小时的分区
partition_name = f"partition_{current_hour}"
if not collection.has_partition(partition_name):
collection.create_partition(partition_name)
print(f"Created partition: {partition_name}")
partition = collection.partition(partition_name)
partition.load()
# 获取所有分区
partitions = collection.partitions
# 删除所有分区(除了默认分区和当前分区)
for partition in partitions:
name = partition.name
if name not in ["_default", partition_name]: # 保留默认分区
pre_partition = collection.partition(name)
pre_partition.release()
collection.drop_partition(name)
print(f"Partition '{name}' deleted.")

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

225
zzb_data_word/app_word.py Normal file
View File

@ -0,0 +1,225 @@
from fastapi import FastAPI
from pydantic import BaseModel
import os
import utils
import queue
from multiprocessing import Process
import word_title
import time
import config
import requests
import threading
from parse_word import parse_docx, split_text_table
import json
import db_service_word
import main_word
from zzb_logger import applog
app = FastAPI()
cpu_count = os.cpu_count()
job_queue = queue.Queue()
# 定义请求体模型
class FileItem(BaseModel):
file_path: str
file_id: str
def split_list(lst, n):
k, m = divmod(len(lst), n)
return [lst[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]
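# Illustrative check (not part of the original commit): split_list spreads the
# remainder over the leading chunks, so chunk sizes differ by at most one.
def _demo_split_list():
    assert split_list(list(range(10)), 3) == [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]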
def run_job():
#判断是否有任务在执行
if_run = True
if job_queue.empty():
applog.info(f"job_queue为空:")
if_run = False
if if_run:
job_config = job_queue.get()
file_path = job_config['file_path']
file_id = job_config['file_id']
continue_execution = True
try:
start_time = time.time()
applog.info(f"开始启动文件解析任务: {file_path}")
if file_path.startswith('http'):
file_path = utils.save_pdf_from_url(file_path, config.FILE_PATH)
try:
time_dispatch_job = time.time()
# 通知开始解析 暂时不通知
response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 5})
applog.info(f'通知pdf开始解析url:{file_id}:{response.url}')
applog.info(f'通知pdf开始解析状态:{file_id}:{response.text}')
parsed_content, catalog_content = parse_docx(file_path)
json_parsed_content = json.loads(parsed_content)
json_catalog_content = json.loads(catalog_content)
db_service_word.word_title_insert_mysql(file_id, json_catalog_content)
parent_table_pages = word_title.get_parent_table_pages(json_catalog_content,file_id)
text_elements_json, table_elements_json = split_text_table(json_parsed_content)
#
processes = []
text_list = split_list(json.loads(text_elements_json), cpu_count)
applog.info(f'text任务ID:{file_id}')
for job_info in text_list:
p = Process(target=main_word.process_text_content, args=(file_id, job_info,json.loads(table_elements_json),json.loads(text_elements_json)))
processes.append(p)
p.start()
applog.info(f'等待所有子任务完成任务ID:{file_id}')
for p in processes:
p.join()
applog.info(f'word表格中 text解析完成任务ID:{file_id}',)
processes = []
table_list = split_list(json.loads(table_elements_json), cpu_count)
applog.info(f'开始解析word表表格中的table任务ID:{file_id}')
for job_info in table_list:
p = Process(target=main_word.process_table, args=(file_id, job_info,))
processes.append(p)
p.start()
applog.info(f'等待所有子任务完成任务ID:{file_id}' )
for p in processes:
p.join()
# main_word.process_table(file_id, json.loads(table_elements_json))
applog.info(f'word表格中 table解析完成任务ID:{file_id}')
time_dispatch_job_end = time.time()
process_time = time_dispatch_job_end - time_dispatch_job
db_service_word.process_time(file_id, '1', process_time, time_dispatch_job, time_dispatch_job_end)
parser_end_time = time.time()
applog.info(f"解析任务 {file_id} 完成,耗时{(parser_end_time - time_dispatch_job):.2f} 秒。")
except Exception as e:
response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 7})
applog.info(f'通知任务状态url:{file_id}:{response.url}')
applog.info(f'通知任务状态任务:{file_id}:{response.text}')
applog.info(f"{file_id}运行失败: {e}")
continue_execution = False
if continue_execution :
#这里做一步判断,看看是否还要继续。
if db_service_word.file_type_check(file_id):
applog.info("文本较真表格生成已结束")
else:
# 通知抽取指标---------------------------------
response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 6})
applog.info(f'通知开始抽取指标url:{file_id}:{response.url}')
applog.info(f'通知开始抽取指标状态:{file_id}:{response.text}')
parser_start_time = time.time()
applog.info(f'开始表格指标抽取任务ID:{file_id}')
time_start = time.time()
# file_type_check_v2 distinguishes Q3 reports (value 3) from annual/semi-annual ones; both kinds currently share the extraction flow below
db_service_word.file_type_check_v2(file_id)
main_word.start_table_measure_job(file_id)
time_start_end = time.time()
process_time = time_start_end - time_start
db_service_word.process_time(file_id,'2',process_time,time_start,time_start_end)
applog.info(f'表格指标抽取完成,任务ID:{file_id}')
parser_end_time = time.time()
applog.info(f"表格指标抽取 {file_id} 完成,耗时{(parser_end_time - parser_start_time):.2f} 秒。")
applog.info(f'启动指标归一化,任务ID:{file_id}')
time_update = time.time()
main_word.update_measure_data(file_id,file_path,parent_table_pages)
applog.info(f'归一化完成,任务ID:{file_id}')
end_time = time.time()
applog.info(f"任务 {file_id} 完成,耗时{(end_time - start_time):.2f} 秒。")
time_update_end = time.time()
process_time = time_update_end - time_update
db_service_word.process_time(file_id,'3',process_time,time_update,time_update_end)
#通知任务完成
response_time = time.time()
response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 1})
applog.info(f'通知任务状态url:{file_id}:{response.url}')
applog.info(f'通知任务状态任务:{file_id}:{response.text}')
response_time_end = time.time()
process_time = response_time_end - response_time
db_service_word.process_time(file_id,'4',process_time,response_time,response_time_end)
except Exception as e:
#通知任务完成
response_time = time.time()
response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 4})
response_time_end = time.time()
process_time = response_time_end - response_time
db_service_word.process_time(file_id,'4',process_time,response_time,response_time_end)
applog.info(f'通知任务状态url:{file_id}:{response.url}')
applog.info(f'通知任务状态任务:{file_id}:{response.text}')
applog.info(f"Response status code: {response.status_code}")
applog.info(f"{file_id}运行失败: {e}")
finally:
applog.info(f"任务 {file_id} 完成")
else:
applog.info("有任务运行中,需要等待.....")
def parse_route(fileItem: FileItem):
# 创建一个队列,保证每次只执行一个文件解析任务
job_queue.put({
'file_path' : fileItem.file_path,
'file_id' : fileItem.file_id,
# 'type': fileItem.type
})
applog.info(f"增加 {fileItem.file_id} 到队列.")
threading.Thread(target=run_job, args=()).start()
return {"success": True, "msg": "文件解析开始"}
app.post("/parser/start",
tags=["parser"],
summary="解析Pdf文件",
)(parse_route)
# 运行 FastAPI 应用
if __name__ == "__main__":
# 服务器启动服务
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=config.PORT)
# 本地调试任务
# file_id = "201837"
# job_queue.put({
# 'file_path': '西部建设.docx',
# 'file_id': file_id,
# })
# db_service_word.delete_database(file_id)
# run_job()
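# Illustrative client call (not part of the original commit), assuming the
# service listens on config.PORT (8001 in this package's config.py):
#
#   import requests
#   requests.post('http://127.0.0.1:8001/parser/start',
#                 json={'file_path': 'http://example.com/report.docx',
#                       'file_id': '201837'})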

23
zzb_data_word/config.py Normal file
View File

@ -0,0 +1,23 @@
MILVUS_CLIENT='http://127.0.0.1:19530'
MILVUS_HOST = '127.0.0.1'
MILVUS_PORT = 19530
MYSQL_HOST = '10.127.2.207'
MYSQL_PORT = 3306
MYSQL_USER = 'financial_prod'
MYSQL_PASSWORD = 'mmTFncqmDal5HLRGY0BV'
MYSQL_DB = 'financial_report_prod'
NOTIFY_ADDR = 'http://10.127.2.202:8100/api/tenant/report/notify'
FILE_PATH = '/root/pdf_parser/word/'
REDIS_HOST = '10.127.2.209'
REDIS_PORT = 6379
REDIS_PASSWORD = 'dMrt4kmwiW6LDJXy'
PORT = 8001
MEASURE_COUNT = 8
MYSQL_HOST_APP = '10.127.2.207'
MYSQL_PORT_APP = 3306
MYSQL_USER_APP = 'financial_prod'
MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
MYSQL_DB_APP = 'financial_report_prod'
api_key = 'sk-f8413fcfa63a40e49dbb8ac1d0239f6d'

File diff suppressed because it is too large Load Diff

823
zzb_data_word/main_word.py Normal file
View File

@ -0,0 +1,823 @@
import re
import os,time
from config import MILVUS_CLIENT,MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB,MEASURE_COUNT,MYSQL_HOST_APP,MYSQL_USER_APP,MYSQL_PASSWORD_APP,MYSQL_DB_APP
import mysql.connector
import utils
from pymilvus import MilvusClient
import numpy as np
from multiprocessing import Process
from config import REDIS_HOST,REDIS_PORT,REDIS_PASSWORD
import redis
import db_service_word
from zzb_logger import applog
'''
Known issues
1. Table/text extraction errors: when text and a table share a page with the text first and the table after, the text can fail to extract
2. LLM mis-extraction: e.g. when extracting 2023 revenue, the per-product change ratio of main-business revenue was picked up by mistake
3. Measures that belong to tables are sometimes extracted as body text
4. The LLM sometimes groups semantically unrelated measures together; consider using vector similarity to tell them apart
'''
# Processing pipeline
# 1. get_table_range: fetch all tables and their context in parallel, producing one complete list
# 2. Merge tables that span pages in a single process, producing a new array of table objects
# 3. Run the original measure-parsing flow over the new table array in parallel
STR_PATTERN = '营业收入|净利润|变动比例|损益|现金流量净额|现金净流量|现金流|每股收益|总资产|资产总额|收益率|货币资金|应收账款|存货|固定资产|在建工程|商誉|短期借款|应付账款|合同负债|长期借款|营业成本|销售费用|管理费用|财务费用|研发费用|研发投入'
PATTERN = '品牌类型|分门店|销售渠道|行业名称|产品名称|地区名称|子公司名称|业绩快报|调整情况说明|调整年初资产负债表|计入当期损益的政府补助|主要子公司|分部|母公司资产负债表|显示服务|渠道|商品类型|合同分类|会计政策变更|地区分类|研发项目|分类产品|表头不合规的表格|内部控制评价|关联方|国内地区|国外地区|销售区域|存货库龄|外币|逾期60天以上|欧元|英镑|美元|日元'
MUILT_PATTERN = '调整前'
#unit_pattern = re.compile(r'单位[|:]?(百万元|千万元|亿元|万元|千元|元)')
unit_pattern = re.compile(r'(单位|单元|人民币).{0,6}?(百万元|千万元|亿元|万元|千元|元).{0,3}?')#修改单位匹配规则,不限制冒号,只限制距离
#获取指标的表头信息
def get_col_num_info(array,row_num,col_num,x,y):
num_info=""
for j in range(col_num):
if len(str(array[x][j])) > 50:
continue
num_info += str(array[x][j])
return num_info.replace('%','')
#获取指标的表头信息
def get_row_num_info(array,row_num,col_num,x,y):
num_info=""
for i in range(row_num):
if len(str(array[i][y])) > 50:
continue
num_info += str(array[i][y])
return num_info
def table_converter(table):
table_string = ''
# 遍历表格的每一行
for row_num in range(len(table)):
row = table[row_num]
# Normalize each cell: collapse embedded newlines to spaces and map None to the string 'None'
cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
# 将表格转换为字符串,注意'|'、'\n'
table_string+=(','.join(cleaned_row))
# 删除最后一个换行符
table_string = table_string[:-1]
return table_string
# 检查第二列是否为中文字符的函数
def is_chinese(s):
return bool(re.search('[\u4e00-\u9fff]', s))
def check_table(arr):
split_index = None
for i in range(arr.shape[0]):
# 过滤掉第一行
if arr[i, 0] == "" and is_chinese(arr[i, 1]) and i > 1:
split_index = i
break
if split_index is not None:
arr1 = arr[:split_index]
arr2 = arr[split_index:]
return [arr1, arr2]
else:
return [arr]
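# Illustrative check (not part of the original commit): a row past the header
# whose first cell is empty while the second holds Chinese text marks the start
# of a second, concatenated table.
def _demo_check_table():
    arr = np.array([['项目', '2024年'], ['营业收入', '100'],
                    ['', '负债项目'], ['短期借款', '50']])
    parts = check_table(arr)
    assert len(parts) == 2 and parts[1][0][1] == '负债项目'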
def safe_process_array(func, arr):
try:
return func(arr)
except Exception as e:
print(f"这个函数出现了报错{func.__name__}: {e}")
return arr # 返回原数组以便继续后续处理
# 单独针对三季报的资产负债表识别合并问题
def process_array(arr, years=['2022', '2023', '2024'], keyword='项目'):
# 确保 row 有足够的列来存储分割后的数据
def ensure_columns(row, num_columns):
while len(row) < num_columns:
row.append('')
def is_valid_header(header, years, keyword):
header_text = header.lower() # 转小写以提高匹配的鲁棒性
return any(year in header_text for year in years) and keyword in header_text
# 对字符串进行清理
def clean_text(text):
# 去除“年”和“月”相邻的空格
text = re.sub(r'\s*(年|月)\s*', r'\1', text)
# 去除“日”左侧相邻的空格
text = re.sub(r'\s*日', '', text)
return text
# 将 numpy 数组转换为列表
arr = arr.tolist() if isinstance(arr, np.ndarray) else arr
if len(arr[0]) == 1 and is_valid_header(arr[0][0], years, keyword):
remaining_value = arr[0][0]
# 清理字符串
remaining_value = clean_text(remaining_value)
parts = remaining_value.split()
ensure_columns(arr[0], len(parts))
for i in range(len(parts)):
arr[0][i] = parts[i]
header_columns = len(arr[0])
for i in range(1, len(arr)):
if len(arr[i]) == 1:
remaining_value = arr[i][0]
parts = remaining_value.split()
if len(parts) > header_columns:
parts = parts[:header_columns]
ensure_columns(arr[i], header_columns)
for j in range(len(parts)):
arr[i][j] = parts[j]
# 如果分割出的值不足,填充空值
if len(parts) < header_columns:
for j in range(len(parts), header_columns):
arr[i][j] = ''
return arr
# 三季报中针对性修改,本报告期和年初至报告期末的两个上年同期进行区分
def process_array_with_annual_comparison(arr, keywords=['本报告期', '年初至报告期末', '上年同期']):
def contains_all_keywords(header, keywords):
return all(keyword in header for keyword in keywords)
def split_and_replace_occurrences(header, target, replacement):
# 找到所有 target 出现的位置
indices = [i for i, x in enumerate(header) if x == target]
if len(indices) > 1:
split_index = len(indices) // 2
for i in range(split_index):
header[indices[i]] = replacement
return header
# 将 numpy 数组转换为列表
arr = arr.tolist() if isinstance(arr, np.ndarray) else arr
if len(arr) > 0 and len(arr[0]) > 0:
first_row = arr[0]
if contains_all_keywords(first_row, keywords):
# 将 "上年同期" 拆分并替换
first_row = split_and_replace_occurrences(first_row, '上年同期', '三季报中无需识别的上年同期')
arr[0] = first_row
return arr
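# Illustrative check (not part of the original commit): when a Q3 header repeats
# 上年同期 for both the quarterly and year-to-date columns, the first occurrence
# is renamed so the two prior-period columns stay distinguishable.
def _demo_process_array_with_annual_comparison():
    arr = [['本报告期', '上年同期', '年初至报告期末', '上年同期']]
    out = process_array_with_annual_comparison(arr)
    assert out[0][1] == '三季报中无需识别的上年同期' and out[0][3] == '上年同期'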
# 三季报的非经常损益的单独处理
def process_array_with_grants(arr, keywords=['本报告期', '年初至报告期'], target='计入当期损益的政府补助',
replacement='非经常性损益'):
# 检查第一行是否包含所有关键词
def contains_all_keywords(header, keywords):
# return all(keyword in header for keyword in keywords)
return all(any(keyword in str(cell) for cell in header) for keyword in keywords)
# 检查第一列中是否存在目标文本
def contains_target_in_first_column(arr, target):
return any(target in str(item[0]) for item in arr)
# 替换第一列中的特定值
def replace_in_first_column(arr, target, replacement):
for i in range(len(arr)):
if arr[i][0] == target:
arr[i][0] = replacement
return arr
# 将 numpy 数组转换为列表
arr = arr.tolist() if isinstance(arr, np.ndarray) else arr
if len(arr) > 0 and len(arr[0]) > 0:
first_row = arr[0]
# 检查第一行和第一列的条件
if contains_all_keywords(first_row, keywords) and contains_target_in_first_column(arr, target):
# 替换第一列中的 "合计"
arr = replace_in_first_column(arr, '合计', replacement)
return arr
# 处理表格数据
def process_table(file_id, tables):
applog.info('Run task %s (%s)...' % (f'处理word文件中的table file_id:{file_id}', os.getpid()))
start = time.time()
conn = mysql.connector.connect(
host=MYSQL_HOST,
user=MYSQL_USER,
password=MYSQL_PASSWORD,
database=MYSQL_DB
)
# 创建一个cursor对象来执行SQL语句
cursor = conn.cursor(buffered=True)
for t in tables:
try:
arr = np.array(t["data"])
arr = safe_process_array(process_array, arr) # 部分资产负债表合并问题
arr = safe_process_array(process_array_with_annual_comparison, arr) # 复杂表格的优化"多个上年同期时处理"
arr = safe_process_array(process_array_with_grants, arr) # 三季报的非经常损益
arr = np.char.replace(arr, ' ', '')
arr = np.char.replace(arr, '\n', '')
arr = np.char.replace(arr, ',', '')
arr_list = check_table(arr)
for a in arr_list:
new_data = a.tolist() # 用于后面保存到数据库中
new_data = utils.check_black_table_list(new_data)
rows, cols = a.shape
if rows == 1 and cols == 1:
continue
arr_str = ''.join([''.join(map(str, row)) for row in a])
# 全量的数据先存入 word_parse_data表中
db_service_word.insert_word_parse_process({
'file_id': file_id,
'page_num': t["index"],
'page_count': 100,
'type': 'table',
'content': {
'page_num': t["index"],
'table_index': t["index"],
"type": "table",
"data": new_data,
}}, conn, cursor, "word_parse_data")
# 过滤掉不包含需抽取指标表格的文本
matches = re.findall(STR_PATTERN, arr_str)
pattern = re.findall(PATTERN, arr_str)
muilt_pattern = re.findall(MUILT_PATTERN, arr_str)
if len(matches) > 0 and len(muilt_pattern) < 5:
# if len(matches) > 0 and len(pattern) == 0 and len(muilt_pattern) < 5:
db_service_word.insert_word_parse_process({
'file_id': file_id,
'page_num': t["index"],
'page_count': 100,
'type': 'parse_table',
'content': {
'page_num': t["index"],
'table_index': t["index"],
"type": "table",
"data": new_data,
}}, conn, cursor,"word_parse_process")
except Exception as e:
applog.info(f'解析表格时出现了异常 {e} 内容为{t}')
cursor.close()
conn.close()
end = time.time()
applog.info('Task %s runs %0.2f seconds.' % (f'解析表格{file_id}', (end - start)))
def text_in_table(top, tables_range, page_num):
if tables_range.get(page_num):
for range in tables_range[page_num]:
if top < range['top'] and top > range['buttom']:
return True
return False
def get_text_type(text: str):
text = re.sub(r"\s", "", text)
first_re = '年度报告'
page_number_pattern = re.compile(r'^\d+(/\d+)?$')
if re.search(first_re, text.strip()):
return 'page_header'
if page_number_pattern.match(text.strip()):
return 'page_footer'
if len(text) < 20 and text.endswith('页'):
return 'page_footer'
return 'text'
def check_report_type(file_id):
conn = mysql.connector.connect(
host=MYSQL_HOST,
user=MYSQL_USER,
password=MYSQL_PASSWORD,
database=MYSQL_DB
)
# 创建一个cursor对象来执行SQL语句
cursor = conn.cursor(buffered=True)
"""
:return: 返回pdf文件中文本内容不包括表格
"""
select_year_select = f"""select report_type,year from report_check where id = {file_id}"""
cursor.execute(select_year_select)
record_select = cursor.fetchall()
if record_select:
report_type = record_select[0][0]
report_year = record_select[0][1]
cursor.close()
conn.close()
return int(report_type),report_year
else:
return None
# 通过text的index 获取最近的一个table的index,并校验中间text文本的长度和数量
def get_next_table_index(text_index, texts, tables):
try:
for table in tables:
if table["index"] > text_index and table["type"] == "table":
table_index = table["index"]
total_len = sum(len(texts.get(key).get("data").replace(" " ,"")) for key in range(text_index + 1, table_index))
# 最近一个表格的索引 在10个以内
if (table_index - text_index) < 10 and total_len < 50:
# 在判断所有的字符串加起来有是否小于50个字
return table_index
else:
return text_index
except StopIteration:
applog.error("Target not found")
return text_index
#处理文本数据
def process_text_content(file_id,texts,tables,full_texts,type =0):
applog.info('Run task %s (%s)...' % (f'处理word文件中的 text file_id:{file_id}', os.getpid()))
conn = mysql.connector.connect(
host=MYSQL_HOST,
user=MYSQL_USER,
password=MYSQL_PASSWORD,
database=MYSQL_DB
)
# 创建一个cursor对象来执行SQL语句
cursor = conn.cursor(buffered=True)
"""
:return: 返回pdf文件中文本内容不包括表格
"""
report_type, report_year = check_report_type(file_id)
texts_dict = {t["index"]:t for t in full_texts}
query = "SELECT title_list,button_list FROM table_title_list WHERE report_year = %s"
cursor_dict = conn.cursor(dictionary=True)
cursor_dict.execute(query, (report_year,))
result = cursor_dict.fetchone()
title_list = result['title_list']
button_list = result['button_list']
try:
for t in texts:
line_text = t["data"]
line_text = re.sub(r"\s", "", line_text)
line_text = re.sub(r"", ":", line_text)
index = t["index"]
if len(re.findall('母公司|现金流量表补充', line_text)) > 0:
db_service_word.insert_measure_parser_info({
'file_id': file_id,
'content': get_next_table_index(index,texts_dict,tables),
'type': 'parent_com',
}, conn, cursor)
# 保存每个表格上方小范围区域的文字,这部分内容包含了表格的标题和指标单位
table_info = {}
if (utils.check_table_title_black_list(line_text, title_list)
or utils.check_table_title_black_list_button(line_text,button_list)):
db_service_word.insert_measure_parser_info({
'file_id': file_id,
'content': get_next_table_index(index,texts_dict,tables),
'type': 'table_index',
}, conn, cursor)
if utils.check_table_title_black_list_measure(line_text):
db_service_word.insert_measure_parser_info_measure({
'file_id': file_id,
'content': get_next_table_index(index, texts_dict,tables),
'type': 'measure_index',
}, conn, cursor, line_text)
if re.findall(unit_pattern, line_text):
# 为单位
table_info = get_table_unit_info(file_id,line_text,t["index"],t["index"]+1)
db_service_word.insert_table_unit_info_v1(table_info,conn,cursor)
if utils.check_table_title_black_list_measure(line_text):
db_service_word.insert_measure_parser_info_measure({
'file_id': file_id,
'content': f"{t['index']}_1",
'type': 'measure_index',
}, conn, cursor, line_text)
if not utils.pdf_text_flag(line_text):
if utils.check_line_text(line_text):
db_service_word.insert_word_parse_process({
'file_id': file_id,
'page_num' : t["index"],
'page_count' : 100,
'type' : 'parse_table',
'content':{
'page_num' : t["index"],
'table_index' : t["index"],
"type" : "text",
'content' : line_text,
}},conn,cursor,"word_parse_process")
# 给慎用词校验用
db_service_word.insert_word_parse_process({
'file_id': file_id,
'page_num': t["index"],
'page_count': 100,
'type': 'text',
'content': {
'page_num': t["index"],
'table_index': t["index"],
"type": "text",
'content': line_text,
}}, conn, cursor, "word_parse_data")
table_name = "word_text_info"
if type == 1:
table_name = "id_text_info"
# 写入数据库 传入表名
db_service_word.batch_insert_page_text({
'file_id': file_id,
'page_num' : t["index"],
'text' : line_text
},conn,cursor, table_name)
for t in tables:
page_num = t["index"]
for lines in t["data"]:
lines = list(set(lines))
for line in lines:
if len(line) == 0:
continue
db_service_word.batch_insert_page_text({
'file_id': file_id,
'page_num' : page_num,
'text' : line
},conn,cursor,"word_text_info")
except Exception as e:
applog.error(f'文本处理异常{e}')
def get_table_unit_info(file_id,line_text,page_num,table_index):
table_info = {}
table_info['file_id'] = file_id
match = unit_pattern.search(line_text)
if match:
unit = match.group(2)
table_info['unit'] = unit
table_info['page_num'] = page_num
table_info['table_index'] = table_index
return table_info
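# Illustrative check (not part of the original commit): unit_pattern tolerates a
# few characters between the 单位 marker and the unit itself.
def _demo_get_table_unit_info():
    info = get_table_unit_info('file-1', '单位:万元 币种:人民币', 12, 13)
    assert info['unit'] == '万元' and info['table_index'] == 13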
def get_table_text_info(file_id,line_text,page_num,table_index):
table_info = {}
table_info['file_id'] = file_id
table_info['text_info'] = line_text
table_info['page_num'] = page_num
table_info['table_index'] = table_index
return table_info
# Read the document's tables and join each cell value with its row and column headers, e.g. 2022年1季度营业收入为xxxxx
def get_table_measure(file_id, word_tables, record_range):
"""
:return: pdf中的表格,并将表格中指标和表头合并eg: 2022年1季度营业收入为xxxxx
"""
try:
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
conn = mysql.connector.connect(
host = MYSQL_HOST,
user = MYSQL_USER,
password = MYSQL_PASSWORD,
database = MYSQL_DB
)
# 创建一个cursor对象来执行SQL语句
cursor = conn.cursor(buffered=True)
conn_app = mysql.connector.connect(
host = MYSQL_HOST_APP,
user = MYSQL_USER_APP,
password = MYSQL_PASSWORD_APP,
database = MYSQL_DB_APP
)
# 创建一个cursor对象来执行SQL语句
cursor_app = conn_app.cursor(buffered=True)
select_year_select = f"""select report_type,year from report_check where id = {file_id}"""
cursor.execute(select_year_select)
record_select = cursor.fetchall()
report_type = record_select[0][0]
report_year = record_select[0][1]
client = MilvusClient(
uri= MILVUS_CLIENT
)
applog.info('提取指标任务 %s (%s)...' % (record_range, os.getpid()))
start = time.time()
record_start = record_range.split('-')[0]
record_end = record_range.split('-')[1]
for index in range(int(record_start),int(record_end)):
t = word_tables[index][0]
measure_obj =[]
data_dict = {}
measure_list = []
try:
arr = np.array(t["data"])
rows, cols = arr.shape
if rows == 1 and cols == 1:
continue
row_num , col_num = -1 , -1
# 使用嵌套循环遍历数组,获取第一个数值位置
for i in range(rows):
for j in range(cols):
if j == 0 or i == 0:#防止第一列识别出数字
continue
measure_value_config = str(arr[i, j]).replace('(','').replace(')','')
if re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', measure_value_config):
if j == cols-1:
row_num, col_num = i, j
break
elif (re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', measure_value_config)
or measure_value_config == '-'):
row_num, col_num = i, j
break
else:
continue
break
# 遍历数值二维数组,转成带语义的指标
if row_num != -1 and col_num != -1:
for i in range(row_num,arr.shape[0]):
for j in range(col_num,arr.shape[1]):
measure_value = str(arr[i, j]).replace('%','').replace('(','-').replace(')','')
if measure_value == '-' or measure_value == '' or len(measure_value) > 20:
continue
else:
row_num_info = get_row_num_info(arr,row_num,col_num,i,j)
col_num_info = get_col_num_info(arr,row_num,col_num,i,j)
#如果上表头为空则认为是被截断,除了研发投入特殊处理其它过滤
if row_num_info in ('','-',')',')'):
continue
#特殊处理非经常性损益合计和非经常性损益净额同时出现时保留净额
if col_num_info == '非经常性损益合计':
continue
if utils.check_pdf_measure_black_list(f"{col_num_info}{row_num_info}"):
continue
#去掉没有周期的指标
if utils.check_pdf_measure(f"{col_num_info}{row_num_info}"):
continue
#判断上表头和左表头周期是否一致,不一致过滤
row_period = utils.get_period_type_other(row_num_info, report_year)
col_period = utils.get_period_type_other(col_num_info, report_year)
if(row_period != col_period and row_period != 'c_n' and col_period != 'c_n'):
continue
units_mapping = {
"百万元": "百万元",
"千万元": "千万元",
"亿元": "亿元",
"万元": "万元",
"千元": "千元",
"": "",
"元/股": ""
}
row_num_info = row_num_info.replace('%','增减')
#num_info = f"{col_num_info}{row_num_info}".replace('','').replace('加:','').replace('减:','').replace('%','')
num_info = utils.get_clean_text(f"{row_num_info}{col_num_info}")
num_info_bak = utils.get_clean_text(f"{col_num_info}{row_num_info}")
measure_unit = ''
#"%": "同期增减"
combined_info = f"{row_num_info} {col_num_info}"
# for unit in units_mapping:
# if unit in row_num_info:
# measure_unit = units_mapping[unit]
# break
if utils.get_percent_flag(row_num_info) == '1':
measure_unit = ''
else:
for unit in units_mapping:
if re.search(rf'(\s*{unit}(\s*人民币)?\s*)|\(\s*{unit}(\s*人民币)?\s*\)', combined_info) or (re.search(rf'{unit}', combined_info) and any(re.search('单位', item) for item in arr[0])):
measure_unit = units_mapping[unit]
break
measure_list.append({
'measure_name': num_info,
'measure_value': measure_value,
'measure_unit':measure_unit,
})
measure_list.append({
'measure_name': num_info_bak,
'measure_value': measure_value,
'measure_unit':measure_unit,
})
if not redis_client.exists(f'parsed_measure_count_{file_id}'):
redis_client.set(f'parsed_measure_count_{file_id}', 0)
redis_client.incr(f'parsed_measure_count_{file_id}')
if len(measure_list) > 0:
data_dict["measure_list"] = measure_list
data_dict["page_num"] = f"{str(t['page_num'])}_{str(t['table_index'])}"
data_dict['file_id'] = file_id
measure_obj.append(data_dict)
db_service_word.insert_measure_data_to_milvus(client,measure_obj,cursor_app,conn_app)
except Exception as e:
applog.error(f"循环获取表格数据这里报错了,数据是{t['data']},位置在{index}")
applog.error(f"错误是:{e}")
end = time.time()
applog.info('提取指标 %s runs %0.2f seconds.' % (record_range, (end - start)))
except Exception as e:
applog.error(f'这个错误是{e},所在的位置是{record_start}-{record_end}')
record_start = record_range.split('-')[0]
record_end = record_range.split('-')[1]
for index in range(int(record_start),int(record_end)):
t = word_tables[index]
try:
arr = np.array(t['data'])
except Exception as e:
applog.error(f'这个错误是{e}的arr的值是{arr}')
finally:
redis_client.close()
client.close()
cursor.close()
conn.close()
cursor_app.close()
conn_app.close()
#指标归一化处理
def update_measure_data(file_id,file_path,parent_table_pages):
conn = mysql.connector.connect(
host = MYSQL_HOST,
user = MYSQL_USER,
password = MYSQL_PASSWORD,
database = MYSQL_DB
)
# 创建一个cursor对象来执行SQL语句
cursor = conn.cursor(buffered=True)
# #通过向量查询指标
conn_app = mysql.connector.connect(
host = MYSQL_HOST_APP,
user = MYSQL_USER_APP,
password = MYSQL_PASSWORD_APP,
database = MYSQL_DB_APP
)
# 创建一个cursor对象来执行SQL语句
cursor_app = conn_app.cursor(buffered=True)
applog.info(f'目录黑名单为:{parent_table_pages}')
# db_service_word.delete_to_run(conn,cursor,file_id)
db_service_word.insert_table_measure_from_vector_async_process(cursor,parent_table_pages,file_id,file_path)
# #指标归一化处理
db_service_word.update_ori_measure(conn,cursor,file_id)
# db_service.delete_database(conn_app,cursor_app,file_id)
cursor.close()
conn.close()
cursor_app.close()
conn_app.close()
def merge_consecutive_arrays(word_info):
merged_objects = []
temp_list = []
for info_obj in word_info:
try:
if info_obj['type'] == 'table':
# 如果对象是表格,将其元素添加到临时列表中
data = info_obj['data']
if not data:
continue
first_row = data[0]
if all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) == 0:
temp_list.append(info_obj)
elif all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) > 0:
merged_objects.append(temp_list)
temp_list = []
temp_list.append(info_obj)
elif not all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) > 0:
temp_data = temp_list[-1]['data']
temp_data = list(temp_data)
for row in list(info_obj['data']):
temp_data.append(row)
info_obj['data'] = temp_data
temp_list.clear()
temp_list.append(info_obj)
except Exception as e:
applog.error(f"解析数据错误: {e}")
if temp_list:
merged_objects.append(temp_list)
return merged_objects
def merge_consecutive_arrays_v1(pdf_info):
merged_objects = []
temp_array = {}
def is_same_dimension(data1, data2):
# 检查两个表的每行长度是否相同
if len(data1) != len(data2):
return False
return all(len(row1) == len(row2) for row1, row2 in zip(data1, data2))
for info_obj in pdf_info:
try:
if info_obj['type'] == 'table':
if not temp_array:
# 如果临时列表为空,则初始化临时列表
temp_array = info_obj
else:
# 检查当前表与临时列表中的表是否同维度
if is_same_dimension(temp_array['data'], info_obj['data']):
# 如果是同维度,则合并数据
temp_array['data'].extend(info_obj['data'])
else:
# 如果不是同维度,将现有临时列表添加到结果中,并重置临时列表
merged_objects.append(temp_array)
temp_array = info_obj
else:
# 如果对象不是表格,检查临时列表是否非空
if temp_array:
# 将临时列表中的元素合并成一个数组,并添加到新的对象列表中
merged_objects.append(temp_array)
temp_array = {} # 重置临时列表
except Exception as e:
applog.error(f"解析数据错误: {e}")
# 循环结束后,检查临时列表是否非空,如果非空,则添加到结果中
if temp_array:
merged_objects.append(temp_array)
return merged_objects
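# Illustrative check (not part of the original commit): consecutive tables with
# identical row shapes are treated as one table split across pages and stitched
# back together.
def _demo_merge_consecutive_arrays_v1():
    pdf_info = [
        {'type': 'table', 'data': [['项目', '金额'], ['营业收入', '1']]},
        {'type': 'table', 'data': [['短期借款', '2'], ['长期借款', '3']]},
    ]
    merged = merge_consecutive_arrays_v1(pdf_info)
    assert len(merged) == 1 and len(merged[0]['data']) == 4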
def start_table_measure_job(file_id):
conn_app = mysql.connector.connect(
host = MYSQL_HOST_APP,
user = MYSQL_USER_APP,
password = MYSQL_PASSWORD_APP,
database = MYSQL_DB_APP
)
# 创建一个cursor对象来执行SQL语句
cursor_app = conn_app.cursor(buffered=True)
select_process_query = '''
select DISTINCT content from word_parse_process WHERE file_id = '{file_id}' and type='parse_table' order by page_num
'''.format(file_id=file_id)
cursor_app.execute(select_process_query)
records = cursor_app.fetchall()
word_info = []
for record in records:
word_info.append(eval(record[0]))
# 获取table 数据
word_tables = merge_consecutive_arrays(word_info)
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
redis_client.set(f'measure_count_{file_id}', len(word_tables))
cursor_app.close()
conn_app.close()
redis_client.close()
records_range_parts = utils.get_range(len(word_tables),MEASURE_COUNT)
processes = []
for record_range in records_range_parts:
p = Process(target=get_table_measure, args=(file_id,word_tables,record_range,))
processes.append(p)
p.start()
for p in processes:
p.join()

269
zzb_data_word/parse_word.py Normal file
View File

@ -0,0 +1,269 @@
from docx import Document
import json
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from lxml import etree
import os
import zipfile
RESULT_TYPE_TEXT = 'text'
RESULT_TYPE_TABLE = 'table'
def build_result(result_type, index, data):
return {
'type': result_type,
'index': index,
'data': data
}
def build_catalog_result(index, depth, data):
return {
'index': index,
'depth': depth,
'data': data
}
# 解析docx文件中的XML内容
def get_xml_content(docx_filename, xml_filename):
with zipfile.ZipFile(docx_filename) as z:
return z.read(xml_filename)
def parse_paragraph(paragraph, index, namespaces):
paragraph_text = paragraph.text.strip() if paragraph else ''
if paragraph_text:
return build_result(RESULT_TYPE_TEXT, index, paragraph_text)
return None
def parse_table(table, index):
table_data = []
for row in table.rows:
row_data = [cell.text for cell in row.cells]
table_data.append(row_data)
return build_result(RESULT_TYPE_TABLE, index, table_data)
def parse_paragraph_element(paragraph_element, index, namespaces):
paragraph_xml = etree.fromstring(paragraph_element.xml)
paragraph_text = ''.join(paragraph_xml.xpath('//w:t/text()', namespaces=namespaces)).strip()
if paragraph_text:
return build_result(RESULT_TYPE_TEXT, index, paragraph_text)
return None
def parse_table_element(table_element, index, namespaces):
table_xml = etree.fromstring(table_element.xml)
table_data = []
for row in table_xml.xpath('//w:tr', namespaces=namespaces):
row_data = []
for cell in row.xpath('./w:tc | ./w:sdt', namespaces=namespaces):
cell_text = ''.join(cell.xpath('.//w:t/text()', namespaces=namespaces)).strip()
grid_span_xpath = etree.XPath('.//w:tcPr/w:gridSpan/@w:val', namespaces=namespaces)
grid_span = int(grid_span_xpath(cell)[0]) if grid_span_xpath(cell) else 1
if grid_span > 1:
row_data.extend([cell_text] * grid_span)
else:
row_data.append(cell_text)
table_data.append(row_data)
return build_result(RESULT_TYPE_TABLE, index, table_data)
def add_to_catalog(element_xml, index, catalog_content, namespaces, paragraph_text, heading_styles):
p_element = etree.fromstring(element_xml)
# outlineLvl = p_element.xpath('.//w:outlineLvl', namespaces=namespaces)
# if outlineLvl:
# level = int(outlineLvl[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'))
# catalog_content.append(build_catalog_result(index, level, paragraph_text))
level = is_heading_paragraph(p_element, heading_styles, namespaces)
if level != -1:
catalog_content.append(build_catalog_result(index, level, paragraph_text))
# Check whether a paragraph uses a heading style; returns the heading level, or -1 if it is not a heading
def is_heading_paragraph(paragraph, heading_styles, namespaces):
pPr = paragraph.find('.//w:pPr', namespaces=namespaces)
if pPr is not None:
pStyle = pPr.find('.//w:pStyle', namespaces=namespaces)
pOutLineLvl = pPr.find('.//w:outlineLvl', namespaces=namespaces)
if pStyle is not None:
style_val = pStyle.get(f"{{{namespaces['w']}}}val")
if style_val.isdigit():
return int(style_val)
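# Note: w:outlineLvl is 0-based while the heading levels used here are 1-based, hence the +1 below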
if pOutLineLvl is not None:
outLineLvl_val = pOutLineLvl.get(f"{{{namespaces['w']}}}val")
if outLineLvl_val.isdigit():
return int(outLineLvl_val) + 1
# if pStyle is not None and pStyle.get(ns['w'] + 'val') in heading_styles:
# if style_val > 0:
# return True
return -1
def get_paragraph_text(paragraph_element, namespaces):
paragraph_text = ''
for run in paragraph_element.findall('.//w:r', namespaces=namespaces):
for text in run.findall('.//w:t', namespaces=namespaces):
paragraph_text += text.text if text.text is not None else ''
return paragraph_text
def add_to_catalog_paragraph(text, index, catalog_content, namespaces):
# Append the paragraph to the catalog with a default depth of 1
catalog_content.append(build_catalog_result(index, 1, text))
def parse_sdt_catalog(sdt_element, catalog_content, index, namespaces):
sdt_content = sdt_element.find('.//w:sdtContent', namespaces=namespaces)
if sdt_content is not None:
for child in sdt_content:
if child.tag.endswith('p'): # paragraph inside the content control
paragraph_text = get_paragraph_text(child, namespaces)
if paragraph_text.strip(): # skip empty text
add_to_catalog_paragraph(paragraph_text, index, catalog_content, namespaces)
index += 1 # advance the index
elif child.tag.endswith('tbl'): # table inside the content control
# table content is not added to the catalog
pass
elif child.tag.endswith('sdt'): # nested content control
index = parse_sdt_catalog(child, catalog_content, index, namespaces) # recurse into the nested control
return index
def parse_docx(docx_path):
try:
document = Document(docx_path)
styles_xml = get_xml_content(docx_path, 'word/styles.xml')
except Exception as e:
print(f"Error loading document: {e}")
return None, None
doc_content = [] # document content (text + tables)
catalog_content = [] # catalog (headings)
current_index = 1 # global running index across paragraphs and tables
paragraph_index = 0
table_index = 0
# Root XML element of the whole document
xml_root = document.part.element
namespaces = xml_root.nsmap
# Collect all heading style ids defined in styles.xml
styles_root = etree.fromstring(styles_xml)
heading_styles = set()
for style in styles_root.xpath('//w:style', namespaces=namespaces):
style_type = style.get(f"{{{namespaces['w']}}}type")
style_id = style.get(f"{{{namespaces['w']}}}styleId")
if style_type == 'paragraph' and style_id and style_id.startswith('Heading'):
heading_styles.add(style_id)
# Walk every top-level element in the document body
for i, element in enumerate(document.element.body):
if isinstance(element, CT_P): # paragraph
paragraph_result = parse_paragraph_element(element, current_index, namespaces)
if paragraph_result:
doc_content.append(paragraph_result)
# If the paragraph is a heading, record it in the catalog
paragraph = document.paragraphs[paragraph_index]
add_to_catalog(paragraph._element.xml, current_index, catalog_content, namespaces, paragraph.text, heading_styles)
current_index += 1 # advance the index
paragraph_index += 1
elif isinstance(element, CT_Tbl): # table
table_result = parse_table_element(element, current_index, namespaces)
if table_result:
doc_content.append(table_result)
current_index += 1 # advance the index
table_index += 1
elif element.tag.endswith('sdt'): # content control
current_index = parse_sdt(element, doc_content, current_index, namespaces, catalog_content, heading_styles) # returns the updated index
return json.dumps(doc_content, indent=4, ensure_ascii=False), json.dumps(catalog_content, indent=4, ensure_ascii=False)
def parse_sdt(sdt_element, doc_content, current_index, namespaces, catalog_content, heading_styles):
sdtContent = sdt_element.find('.//w:sdtContent', namespaces=namespaces)
if sdtContent is not None:
for child in sdtContent:
if child.tag.endswith('p'): # paragraph inside the content control
paragraph_text = ''
for run in child.findall('.//w:r', namespaces=namespaces):
for text in run.findall('.//w:t', namespaces=namespaces):
paragraph_text += text.text if text.text is not None else ''
if paragraph_text.strip(): # skip empty text
doc_content.append(build_result(RESULT_TYPE_TEXT, current_index, paragraph_text.strip()))
# If the paragraph is a heading, record it in the catalog
add_to_catalog(child.xml, current_index, catalog_content, namespaces, paragraph_text, heading_styles)
current_index += 1 # advance the index
elif child.tag.endswith('tbl'): # table inside the content control
table_data = []
merged_cells = {} # tracks vertically merged (row-spanning) cells
for row_idx, row in enumerate(child.findall('.//w:tr', namespaces=namespaces)):
row_data = []
for col_idx, cell in enumerate(row.findall('.//w:tc', namespaces=namespaces)):
cell_text = ''
for run in cell.findall('.//w:r', namespaces=namespaces):
for text in run.findall('.//w:t', namespaces=namespaces):
cell_text += text.text if text.text is not None else ''
# Check whether the cell spans multiple columns (w:gridSpan)
grid_span_xpath = etree.XPath('.//w:tcPr/w:gridSpan/@w:val', namespaces=namespaces)
grid_span = int(grid_span_xpath(cell)[0]) if grid_span_xpath(cell) else 1
if grid_span > 1:
row_data.extend([cell_text.strip()] * grid_span)
else:
row_data.append(cell_text.strip())
# Check whether the cell spans multiple rows (w:vMerge)
v_merge_xpath = etree.XPath('.//w:tcPr/w:vMerge/@w:val', namespaces=namespaces)
v_merge = v_merge_xpath(cell)
if v_merge and v_merge[0] == 'restart':
merged_cells[(row_idx, col_idx)] = (int(grid_span), 1)
elif v_merge and v_merge[0] == 'continue':
if (row_idx - 1, col_idx) in merged_cells:
merged_cells[(row_idx - 1, col_idx)] = (merged_cells[(row_idx - 1, col_idx)][0], merged_cells[(row_idx - 1, col_idx)][1] + 1)
# continuation cells are not appended to row_data again
else:
# only non-merged cells would need appending here
pass
# Propagate row-spanning cells into the current row
for (r, c), (col_span, row_span) in list(merged_cells.items()):
if r < row_idx:
for i in range(row_span):
if r + i == row_idx:
row_data[c:c] = [row_data[c]] * (col_span - 1)
break
if r + row_span - 1 == row_idx:
del merged_cells[(r, c)]
table_data.append(row_data)
if table_data: # skip empty tables
doc_content.append(build_result(RESULT_TYPE_TABLE, current_index, table_data))
current_index += 1 # advance the index
elif child.tag.endswith('sdt'): # nested content control
current_index = parse_sdt(child, doc_content, current_index, namespaces, catalog_content, heading_styles) # recurse into the nested control
return current_index # return the updated index
def split_text_table(json_data):
# Group elements by type
text_elements = [element for element in json_data if element['type'] == 'text']
table_elements = [element for element in json_data if element['type'] == 'table']
# Serialize each group to a JSON string
text_elements_json = json.dumps(text_elements, ensure_ascii=False, indent=4)
table_elements_json = json.dumps(table_elements, ensure_ascii=False, indent=4)
return text_elements_json, table_elements_json
def append_to_file(file_path, text):
try:
with open(file_path, 'a', encoding='utf-8') as file:
file.write(text + '\n')
except Exception as e:
print(f"Error writing to file: {e}")
if __name__ == "__main__":
current_directory = os.getcwd()
docx_relative_path = '101.docx'
file_relative_path = 'file\\docx\\test1.txt'
docx_path = os.path.join(current_directory, docx_relative_path)
file_path = os.path.join(current_directory, file_relative_path)
try:
parsed_content, catalog_content = parse_docx(docx_path)
if parsed_content and catalog_content:
json_parsed_content = json.loads(parsed_content)
text_elements_json, table_elements_json = split_text_table(json_parsed_content)
append_to_file(file_path, text_elements_json)
append_to_file(file_path, table_elements_json)
append_to_file(file_path, catalog_content)
except Exception as e:
print(f"Error parse_docx: {e}")

View File

@ -0,0 +1,17 @@
import redis
# Write a measure vector into the 'measure_config' Redis hash, keyed by ori_measure_id
def read_from_file_and_write_to_redis(redis_client,ori_measure_id,measure_vector):
redis_client.hset('measure_config',ori_measure_id, measure_vector)
# Read a measure vector back from the 'measure_config' hash
def read_from_redis(redis_client,ori_measure_id):
# hget returns bytes, so decode to str (raises AttributeError if the field is missing)
return redis_client.hget('measure_config',ori_measure_id).decode()
# if __name__ == "__main__":
# # redis_client = redis.Redis(host='123.60.153.169', port=6379, password='Xgf_redis', db=6)
# redis_client = redis.Redis(host='124.70.129.232', port=6379, password='Xgf_redis', db=6)
#
# value = read_from_redis(redis_client,"92b44ffb50b6ab2068f5de447c9925")
# print(value)

View File

@ -0,0 +1,14 @@
camelot-py==0.11.0
pdfminer.six==20221105
PyPDF2==3.0.1
pdfplumber==0.10.3
pymilvus==2.3.3
mysql-connector-python==8.3.0
dashscope==1.17.0
fastapi
pydantic
uvicorn
redis
ghostscript
opencv-python-headless
python-docx
docx2pdf  # imported by zzb_data_word/utils.py
numpy     # imported by zzb_data_word/utils.py

818
zzb_data_word/utils.py Normal file
View File

@ -0,0 +1,818 @@
#coding=utf-8
import dashscope
from http import HTTPStatus
from pymilvus import MilvusClient
import json
from datetime import datetime
import re,os,time
import requests
import config
import numpy as np
from docx2pdf import convert
from config import api_key
dashscope.api_key = api_key
def get_md5(text):
import hashlib
m = hashlib.md5()
m.update(text.encode('utf-8'))
return m.hexdigest()
def embed_with_str(input):
retry = 0
max_retry = 5
t = 0.1
while retry < max_retry:
# Sleep first: the Alibaba (DashScope) embedding API is rate-limited
time.sleep(t)
resp = dashscope.TextEmbedding.call(
model=dashscope.TextEmbedding.Models.text_embedding_v2,
input=input)
if resp.status_code == HTTPStatus.OK:
return resp
elif resp.status_code == 429:
print(f'Rate limited, retrying after {t} seconds')
retry += 1
t += 0.1
else:
print(f'Request failed, status code: {resp.status_code}')
return None
print('Retry limit exceeded')
return None
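# Example usage (assumes a valid config.api_key):
# resp = embed_with_str('营业收入')
# vector = resp.output["embeddings"][0]["embedding"]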
# If '归属于' or '扣非' appears, keep the bracketed content and strip punctuation and Chinese numerals.
# If a quarter keyword appears, replace the bracketed content with the quarter name.
# If '±' appears, replace the bracketed content with '同期增减'.
# Otherwise, delete the bracketed content entirely.
def get_clean_text(text):
text = text.replace('流动资产:','').replace('半年度','上半年')
# Normalize a few semi-annual-report terms first so the vector lookup can recognize them
terms = ["货币资金", "应收账款",'应付账款']
# Terms that should not get the '合计' (total) suffix appended
terms_2 = ["固定资产","短期借款","合同负债","在建工程","商誉","存货"]
# Terms whose word order would need swapping (currently unused)
#terms_3 = ["固定资产","短期借款","合同负债","在建工程","商誉"]
# If any of these appear, return the text untouched (no YoY-style rewriting)
# NOTE: several single-character terms are unrecoverable (lost in the diff rendering); the empty
# strings they left behind are dropped, since '' is a substring of every string and made this check always true
terms_4 = ['年以内','年以上','年内','1-2年','2-3年','3-4年','4-5年','准备','在途','增值','评估','利息','应计','改良','跌价','补助','投资']
dates = [ "2021年12月31日","2022年12月31日","2022年1月1日","2023年1月1日", "2023年12月31日", "2022年6月30日","2023年6月30日","2024年6月30日","2024年半年度","2023年半年度","2022年半年度"]
#dates = [ "2021年12月31日","2022年12月31日","2023年12月31日","2022年1月1日","2023年1月1日", "2024年1月1日", "2022年6月30日","2023年6月30日","2024年6月30日","2021年初","2022年初","2023年初","2024年初",'2021年末','2022年末','2023年末','2024年末',"2023年","2022年","2021年"]
if any(term in text for term in terms_4):
return text
if len(text) <= 20:
for term in terms:
for date in dates:
if term in text and date in text:
text = f"{date}{term}合计"
return text
if len(text) <= 20:
for term in terms_2:
for date in dates:
if term in text and date in text:
text = f"{date}{term}"
return text
replacement_dict = {
'加:': '',
'减:': '',
'%' : '',
'其中:': '',
'实际': '',
'/': '',
'重述后':'',
'年末金额':'年末',
'比重增减':'同比增减',
'比例':'同比',
}
# Apply all replacements across the whole text
def replace_all(text, replacements):
pattern = re.compile("|".join(map(re.escape, replacements.keys())))
return pattern.sub(lambda match: replacements[match.group(0)], text)
text = replace_all(text, replacement_dict)
# Drop '12月31日' when it is not preceded by a year
pattern_year = r'(?<!2026年|2025年|2024年|2023年|2022年|2021年)12月31日'
text = re.sub(pattern_year, '', text)
# Match both full-width (…) and half-width (...) bracketed spans, brackets included
# NOTE: the full-width parentheses were lost in the diff rendering and are reconstructed here
pattern = r"\([^)]*\)|\([^)]*\)"
matches = re.findall(pattern, text)
quarter_keywords = {
"1-3月": "第一季度",
"第1季度": "第一季度",
"4-6月": "第二季度",
"第2季度": "第二季度",
"7-9月": "第三季度",
"第3季度": "第三季度",
"10-12月": "第四季度",
"第4季度": "第四季度",
"调整后": "调整后",
"增减": "增减",
"±": "同期增减", # restored per the comment above get_clean_text; the glyph was lost in the diff rendering
# NOTE: a few other single-character entries are unrecoverable (only empty strings survive) and are
# omitted, since an empty key would match every span
"年内到期": "年内到期",
"1-6月": "", # the separator in this key was lost in the diff rendering; an ASCII dash is assumed
"发行新股": "发行新股",
}
# Inspect each bracketed span and decide how to rewrite it
for match in matches:
month_keywords_found = re.search(r"归属于|扣非", match)
if not month_keywords_found: # the span does not contain the special keywords
replaced = False
for keyword, replacement in quarter_keywords.items():
if re.search(keyword, match):
text = re.sub(re.escape(match), replacement, text) # keyword hit: replace the whole span
replaced = True
break
if not replaced:
text = re.sub(re.escape(match), "", text) # no keyword matched: drop the span
else: # the span contains 归属于/扣非: keep its content and strip punctuation from the text
text = re.sub(r"[^\w\s]", "", text)
return text
def convert_docx_to_pdf(file_path):
# Only .docx files can be converted
if file_path.lower().endswith('.docx'):
# Derive the PDF path from the source path
pdf_path = os.path.splitext(file_path)[0] + '.pdf'
try:
# Run the conversion
convert(file_path, pdf_path)
print(f"Converted successfully: {pdf_path}")
except Exception as e:
print(f"Conversion failed: {e}")
else:
print("Error: the file must be in .docx format.")
def save_pdf_from_url(url, file_path):
from urllib.parse import unquote
# Issue a GET request and save the response body to disk
response = requests.get(url)
local_file_path = ''
url = unquote(url)
# Check the response status code
if response.status_code == 200:
# Download succeeded: strip query parameters, then take the file name from the URL path
url_without_params = url.split('?')[0]
file_name = url_without_params.split('/')[-1]
#https://financial-report-test.obs.cn-east-3.myhuaweicloud.com:443/upload/file/909f3dd3337a4dd4bc24fb4748c6c76e.PDF?AccessKeyId=IIDIMIUZ1UBBVPKIVB4W&Expires=1726798358&Signature=fKgrDPjmd99Nje4wwvBJxmFlXZY%3D
# Local path to save the file under
local_file_path = file_path + file_name
# local_file_path = convert_docx_to_pdf(local_file_path)
with open(local_file_path, 'wb') as file:
file.write(response.content)
print(f"File downloaded to {local_file_path}")
else:
# Download failed
print(f"Failed to download the file, status code: {response.status_code}")
return local_file_path
def get_range(count, parts_num):
# Never create more parts than there are items
if count < parts_num:
parts_num = count
# divmod() returns the quotient and remainder in one call
quotient, remainder = divmod(count, parts_num)
count_range_parts = []
for i in range(parts_num):
start_num = i * quotient
if i < parts_num - 1:
end_num = start_num + quotient
else:
# the last part absorbs the remainder
end_num = count
count_range_parts.append(f'{start_num}-{end_num}')
return count_range_parts
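# Example: get_range(10, 4) -> ['0-2', '2-4', '4-6', '6-10'] (the last slice absorbs the remainder)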
def cosine_similarity(vector_a, vector_b):
# Convert both vectors to NumPy arrays
vector_a = np.array(vector_a)
vector_b = np.array(vector_b)
# Dot product of the two vectors
dot_product = np.dot(vector_a, vector_b)
# Euclidean norms of the two vectors
norm_a = np.linalg.norm(vector_a)
norm_b = np.linalg.norm(vector_b)
# Cosine similarity = dot product divided by the product of the norms
cosine_sim = dot_product / (norm_a * norm_b)
return cosine_sim
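# Example: cosine_similarity([1, 0], [1, 0]) -> 1.0, while orthogonal vectors score 0.0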
def get_period_type(text, year):
l_year = f'{int(year)-1}'
bl_year = f'{int(year)-2}'
c_period = f'当期|本期|本报告期|报告期|本年|{year}'
l_period = f'上年|上期|上年度|{l_year}'
bl_period = f'前年|{bl_year}'
if len(re.findall(c_period, text)) > 0:
return 'c'
elif len(re.findall(l_period, text)) > 0:
return 'l'
elif len(re.findall(bl_period, text)) > 0:
return 'bl'
else:
return 'c'
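# Period codes: 'c' = current period, 'l' = prior year, 'bl' = the year before that;
# e.g. get_period_type('报告期末总资产', '2023') returns 'c'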
def get_period_type_other(text, year):
l_year = f'{int(year)-1}'
bl_year = f'{int(year)-2}'
c_period = f'当期|本期|本报告期|报告期|本年|{year}'
l_period = f'上年|上期|上年度|{l_year}'
bl_period = f'前年|{bl_year}'
if len(re.findall(c_period, text)) > 0:
return 'c'
elif len(re.findall(l_period, text)) > 0:
return 'l'
elif len(re.findall(bl_period, text)) > 0:
return 'bl'
else:
return 'c_n'
def get_start_period_type(text):
s_period = '期初|1月1日|年初'
if len(re.findall(s_period, text)) > 0:
return ''
else:
return '0'
def get_season_flag(text):
season_period = '第1季度|第2季度|第3季度|第4季度|一季度|二季度|三季度|四季度|1-3月|4-6月|7-9月|10-12月'
if len(re.findall(season_period, text)) > 0:
return '1'
else:
return '0'
def get_percent_flag(text):
percent_word = '收益率|占比|比重|比例|同比增减|同比上升|同比下降|变化幅度|同期增减|本年比上年增减|同比变动|变动比例|本年度比上年度增减|增减'
if len(re.findall(percent_word, text)) > 0:
return '1'
else:
return '0'
def get_kf_flag(text):
kf_word = '扣非|扣除非经常性损益'
if len(re.findall(kf_word, text)) > 0:
return '1'
else:
return '0'
def get_report_start(text):
kf_word = '报告期初|1月1日'
if len(re.findall(kf_word, text)) > 0:
return '1'
else:
return '0'
def get_percent_growth(text):
percent_growth_word = '变动|本年比上年|比例同比增减|比例同比上升|比例同比下降|比例变化幅度|比例变动比例|比例本期比上年同期增减|比例本年比上年增减|比例同比变动|比例本期期末金额较上期期末变动比例|比率同比增减|比率同比上升|比率同比下降|比率变化幅度|比率变动比例|比率本期比上年同期增减|比率本年比上年增减|比率同比变动|比率本期期末金额较上期期末变动比例|占比同比增减|占比同比上升|占比同比下降|占比变化幅度|占比变动比例|占比本期比上年同期增减|占比本年比上年增减|占比同比变动|占比本期期末金额较上期期末变动比例|费用同比增减|费用同比上升|费用同比下降|费用变化幅度|费用变动比例|费用本期比上年同期增减|费用本年比上年增减|费用同比变动|费用本期期末金额较上期期末变动比例'
if len(re.findall(percent_growth_word, text)) > 0:
return '1'
else:
return '0'
def check_black_list(meta_measure, pdf_measure, black_array):
# The blacklist is supplied by the caller (fetched from the database upstream)
#black_array = fetch_black_list_data(cursor)
for black in black_array:
black_meta = black.split(':')[0]
black_pdfs = black.split(':')[1].split(',')
if meta_measure==black_meta:
for pdf in black_pdfs:
if pdf_measure.find(pdf) >= 0:
return True
return False
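# Blacklist entry format: 'meta_measure:term1,term2,...'; the pair is rejected when the metric
# name equals meta_measure and the PDF-side measure contains any of the terms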
def check_black_list_old(meta_measure,pdf_measure):
# Check whether the measure name pair hits a hard-coded blacklist entry
#black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额','营业收入:营业外收入,主营业务,营业总收入,扣除,年度公司','归母净利润:净资产,净利率,扣除,年度公司','扣非净利润:净资产,净利率,年度公司','经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计','投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计','非经常性损益:扣除非经常性损益','基本每股收益:稀释每股收益','稀释每股收益:基本每股收益','总资产:净资产','应收账款:应付账款','短期借款:长期借款','应付账款:应收账款','长期借款:短期借款','研发投入:比例,比率,占比,费用','资本化研发投入:比例,比率,占比,费用','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用','上年年末:1月1日']
black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额,合计'
,'营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司,合计,汇总'
,'归母净利润:净资产,净利率,扣除,年度公司,归属于本公司普通股股东的净利润'
,'扣非净利润:净资产,净利率,年度公司'
,'经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计,每股,扣除'
,'筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计,每股,扣除'
,'投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计,每股,扣除'
,'非经常性损益:扣除非经常性损益'
,'基本每股收益:稀释每股收益,发行新股'
,'稀释每股收益:基本每股收益,发行新股'
,'总资产:净资产','应收账款:应付账款,年以上,内,至,到'
,'短期借款:长期借款,非流动负债,年以上,年以内,内,至,到'
,'应付账款:应收账款,年以上,内,至,到'
,'长期借款:短期借款,非流动负债,年以上,内,至,到,保证,抵押'
,'研发投入:比例,比率,占比,费用,占'
,'资本化研发投入:比例,比率,占比,费用,占'
,'资本化研发投入占比:金额,费用'
,'研发投入占营业收入比例:金额,费用'
,'上年年末:1月1日'
,'期加权平均净资产收益率:同比,扣除,扣非,年化,每股'
,'期扣非加权平均净资产收益率:同比,年化,每股'
,'加权平均净资产收益率同比变动:年化,每股'
,'研发费用:制造,投入,直接,管理'
,'应收账款:1-2年','货币资金:在途'
,'当期:2023年1-6月,调整后'
,'营业成本:营业总成本'
,'长期借债:年内到期','研发投入:直接'
,'第一季度:第二季度,第三季度,第四季度'
,'第二季度:第一季度,第三季度,第四季度'
,'第三季度:第二季度,第一季度,第四季度'
,'第四季度:第二季度,第三季度,第一季度'
,'研发费用:研发支出,研发投入','存货:跌价准备'
,'费用:日常,付现','固定资产:改良,补助,投资']
# current_period = f'当期:{report_year}年1-6月'
# black_array.append(current_period)
for black in black_array:
black_meta = black.split(':')[0]
black_pdfs = black.split(':')[1].split(',')
if meta_measure.find(black_meta) >= 0:
for pdf in black_pdfs:
if pdf_measure.find(pdf) >= 0:
return True
return False
def check_white_list(meta_measure,pdf_measure):
white_array = ['基本每股收益:每股收益','加权平均净资产收益率同比变动:比','季度变动比例:比']
for white in white_array:
white_meta = white.split(':')[0]
white_pdfs = white.split(':')[1].split(',')
if white_meta in meta_measure:
for pdf in white_pdfs:
if pdf_measure.find(pdf) < 0:
return True
return False
def check_title_black_list(meta_measure,text_info):
# Check whether the measure name, combined with the surrounding title text, hits a blacklist entry
black_array = ['营业收入:前五名,前5名,合计','营业成本:合计','财务费用:现金流','销售费用:现金流','管理费用:现金流','研发费用:现金流','非经常性损益:合计']
for black in black_array:
black_meta = black.split(':')[0]
black_pdfs = black.split(':')[1].split(',')
if meta_measure.find(black_meta) >= 0:
for pdf in black_pdfs:
if text_info.find(pdf) >= 0:
return True
return False
# True when the proportion of alphabetic characters (CJK counts as alphabetic) is at or below the threshold
def under_non_alpha_ratio(text: str, threshold: float = 0.6):
if len(text) == 0:
return False
alpha_count = len([char for char in text if char.strip() and char.isalpha()])
total_count = len([char for char in text if char.strip()])
try:
ratio = alpha_count / total_count
return ratio <= threshold
except ZeroDivisionError: # all-whitespace text
return False
def check_table_title_black_list(text, table_title_black_list):
#previous_year = int(report_year) - 1
if table_title_black_list is None:
return False
if len(re.findall(table_title_black_list, text)) > 0:
return True
if re.search(r'上年度\s*$', text):
return True
return False
# Match the text area above a table against a keyword blacklist to find tables that should be filtered out
def check_table_title_black_list_old(text, report_year):
previous_year = int(report_year) - 1
table_title_black_list = f"""所有权或使用权受到限制的资产|持有待售资产|关联交易|未确认递延所得税资产明细|{previous_year}年度|{previous_year}年1-6月|自{previous_year}年1月1日至6月30日止期间|流动性风险|关联交易|账龄超过|流动风险|公司资产负债表|按账龄组合|线上直营|线上直销|公司现金流量表|公司利润表|应收账款|在建工程|固定资产|其他与筹资活动有关的现金|汇率风险|市场风险|主营业务收入|主营收入|其他收入|前五名|前5名|经营活动有关的现金|股份变动对最近一年和最近一期每股收益、每股净资产等财务指标的影响|合同产生的收入情况|子公司|参股公司|控股公司|分解信息|经营活动产生的现金|行业分类|产品分类|地区分类|业绩快报|销售渠道|调整情况说明|合同分类|计入当期损益的政府补助|股份变动对最近一年和最近一期|分部的财务信息|显示服务创收|线上销售情况|试运行销售|会计政策变更|品牌经营业务|工程施工业务|开发业务|制造业务|合营安排或联营企业中的权益|联营企业的主要财务信息|汇率及通货膨胀|与金融工具相关的风险|运营业务|B端业务|终止经营现金流量|终止经营|公司股份总数及股东结构变动及公司资产和负债结构的变动情况|母公司|现金流量表补充|直营店店效情况|担保人2023年度未经审计的|外汇风险|公司各业务板块经营情况|报告期确认的包括在合同负债期初账面价值中的收入|资产受限情况|资产权利受限情况|内控自我评价报告|所有权或使用权受限资产|合并日被合并方资产、负债的账面价值|经营租赁资产|前5|前五|②|不属于现金及现金等价物的货币资金|按销售模式分|按产品类别分|按照销售区域|产品类别|销售模式|经销模式|关键管理人员|截至{previous_year}年6月30日止六个月期间|关联方提供的存款及贷款服务|报告期内各销售渠道的盈利情况|报告期内各地区的盈利情况|报告期内各产品的盈利情况|其他非流动负债|关联方提供的存款及贷款服务|自营销售分商品类别数据|组合计提|考核指标|不属于现金及现金等价物的货币资金|应收款项融资|本期计提、收回或转回的坏账准备情况|存货跌价准备|持有待售负债"""
if len(re.findall(table_title_black_list, text)) > 0:
return True
if re.search(r'上年度\s*$', text):
return True
return False
# Match the text area at the bottom of the page against a keyword blacklist to find tables that should be filtered out
def check_table_title_black_list_button(text,table_title_black_list):
if table_title_black_list is None:
return False
if len(re.findall(table_title_black_list, text)) > 0:
return True
if re.search(r'上年度\s*$', text):
return True
return False
def check_table_title_black_list_button_old(text):
table_title_black_list = """公司资产负债表|公司现金流量表|公司利润表|主营业务收入|主营收入|其他收入|前五名|前5名|经营活动有关的现金|股份变动对最近一年和最近一期每股收益、每股净资产等财务指标的影响|合同产生的收入情况|子公司|参股公司|控股公司|分解信息|经营活动产生的现金|2022年度|行业分类|产品分类|地区分类|业绩快报|销售渠道|调整情况说明|合同分类|计入当期损益政府补助|股份变动对最近一年和最近一期|分部的财务信息|显示服务创收|线上销售情况|试运行销售|品牌经营业务|工程施工业务|开发业务|制造业务|合营安排或联营企业中的权益|联营企业的主要财务信息|汇率及通货膨胀|与金融工具相关的风险|运营业务|B端业务|终止经营现金流量|终止经营|公司股份总数及股东结构变动及公司资产和负债结构的变动情况|不属于现金及现金等价物的货币资金|经营租赁资产|分地区|分产品|分行业|使用权受限资产|资产受限情况|经销模式|持续的第三层次公允价值计量项目,期初与期末账面价值间的调节信息及不可观察参数敏感|权利受限情况|应收款项融资|本期计提、收回或转回的坏账准备情况"""
if len(re.findall(table_title_black_list, text)) > 0:
return True
if re.search(r'上年度\s*$', text):
return True
return False
def check_table_title_black_list_measure(text):
#black_array = ['补充资料:研发费用,管理费用,财务费用'
# ,'营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司,合计,汇总'
#]
table_title_black_list = """补充资料|测试文本|其他非流动负债|应收款项融资|本期计提、收回或转回的坏账准备情况|筹资活动产生的各项负债变动情况|持有待售资产|账龄超过 1 年或逾期的重要应付账款|经营租赁资产|计息金融工具|坏账准备"""
if len(re.findall(table_title_black_list, text)) > 0:
return True
return False
# Filter out raw PDF measure names that contain blacklisted terms
def check_pdf_measure_black_list(text):
pdf_measure_black_list = '股权变动前|股权变动后|含股份支付|境内|境外|调整前|有限公司|责任公司|其他|变更前|差异|同口径|调整金额'
if len(re.findall(pdf_measure_black_list, text)) > 0:
return True
if "其中:营业收入" in text:
return False
if "同比" in text and "" in text:
#if text.find("同比") < text.find("额"):
if text.endswith(""):
return True
return False
def check_pdf_measure(pdf_measure):
keywords_1 = [
'2022年', '2023年', '2021年', '第一季度', '第二季度', '第三季度', '第四季度', '增减', '变动', '本期','同期', '当期', '报告期', '前年',
'上年', '上期', '本年', '1-3月', '4-6月', '7-9月', '10-12月'
]
keywords_2 = ['这里是一个测试文本']
contain_keyword_1 = any(keyword in pdf_measure for keyword in keywords_1)
contain_keyword_2 = any(keyword in pdf_measure for keyword in keywords_2)
# Drop the measure only when no period keyword (keywords_1) appears but a keywords_2 term does
if not contain_keyword_1 and contain_keyword_2:
return True
return False
# def check_white_list(meta_measure,pdf_measure):
# # 判断指标名是否包含白名单词
# black_array = ['营业收入:营业外收入,主营业务,营业总收入,扣除','归母净利润:净资产,净利率,扣除','扣非净利润:净资产,净利率','经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计','投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计','非经常性损益:扣除非经常性损益','基本每股收益:稀释每股收益','稀释每股收益:基本每股收益','总资产:净资产','应收账款:应付账款','短期借款:长期借款','应付账款:应收账款','长期借款:短期借款','研发投入:比例,比率,占比,费用','资本化研发投入:比例,比率,占比,费用','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用']
# for black in black_array:
# black_meta = black.split(':')[0]
# black_pdfs = black.split(':')[1].split(',')
# if meta_measure.find(black_meta) >= 0:
# for pdf in black_pdfs:
# if pdf_measure.find(pdf) >= 0:
# return True
# return False
def check_line_text(line_text):
if line_text == 'PAGE':
return False
if line_text == '(续)':
return False
if line_text.endswith('(续)'):
return False
if line_text.endswith("年度财务报表") and "有限公司" in line_text:
return False
if len(line_text) < 20 and line_text.endswith("有限公司"):
return False
substrings = [
'对内加快发展方式绿色转型、对外形成绿色生产和生活方式',
'可持续发展、创新发展“8”是八大绿色行动',
'色新赋能、催生绿色新科技、筑牢绿色新支撑',
'接上表','续上表',
]
for substring in substrings:
if substring in line_text:
return False
return True
def pdf_text_flag(text: str):
if under_non_alpha_ratio(text):
return True
if len(text) < 5:
return True
# NOTE: some punctuation alternatives were lost in the diff rendering; common full-width
# sentence punctuation is assumed here
if not re.findall(',|;|。|、|:|!', text):
return True
if text.find('适用') != -1 and text.find('不适用') != -1:
return True
# NOTE: both glyphs were lost in the diff rendering; the report checkbox marks '□' and '√' are assumed
if text.find('□') != -1 and text.find('√') != -1:
return True
return False
def get_change_rate_flag(text):
percent_word = '同比增减|同比上升|同比下降|变化幅度|变动比例|本期比上年同期增减|本年比上年增减|同比变动|本期期末金额较上期期末变动比例'
if len(re.findall(percent_word, text)) > 0:
return '1'
else:
return '0'
def check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,conn_app,cursor_app):
content_value = f"{table_num}_{table_index}"
measure_index_array = []
select_measure_index_query = '''
SELECT DISTINCT text FROM measure_parser_info_linetext WHERE file_id = %s AND type = 'measure_index' and content = %s
'''
cursor_app.execute(select_measure_index_query, (file_id,content_value,))
measure_index_records = cursor_app.fetchall()
for measure_index_record in measure_index_records:
measure_index_array.append(measure_index_record[0])
black_array = ['补充资料:研发费用,管理费用,财务费用,销售费用'
,'测试标题:测试指标'
,'其他非流动负债:合同负债'
,'应收款项融资:应收账款'
,'本期计提、收回或转回的坏账准备情况:应收账款'
,'筹资活动产生的各项负债变动情况:短期借款,长期借款'
,'持有待售资产:固定资产'
,'账龄超过 1 年或逾期的重要应付账款:应付账款'
,'经营租赁资产:固定资产'
,'计息金融工具:货币资金,短期借款,交易性金融资产'
,'坏账准备:应收账款'
]
for black in black_array:
black_meta = black.split(':')[0]
black_pdfs = black.split(':')[1].split(',')
#if measure_index_array.find(black_meta) >= 0:
#if black_meta in measure_index_array:
if any(black_meta in measure_index for measure_index in measure_index_array):
if any(pdf in pdf_measure for pdf in black_pdfs):
#for pdf in black_pdfs:
#if pdf in pdf_measure:
#if pdf_measure.find(pdf) >= 0:
return True
return False
def check_black_table_list(data):
black_array = ['补充资料:研发费用,管理费用,财务费用,销售费用',
#'补充目录:母公司'
]
for black in black_array:
black_meta = black.split(':')[0]
black_pdfs = black.split(':')[1].split(',')
if any(black_meta in cell for row in data for cell in row):
print(data)
for pdf in black_pdfs:
data = [row for row in data if not any(pdf in cell for cell in row)]
return data
if __name__ == '__main__':
print(len('我是我'))
# print(under_non_alpha_ratio('202水电费水电费水电费是的205月'))
# title = '母公司财务报表主要项目注释'
# if len(re.findall('母公司|现金流量表补充', title)) >0 and len(re.findall('项目注释', title)) == 0:
# print('1')
# else:
# print('0')
# print(check_black_list('当期投资活动现金流净额','当前筹资活动现金流净额'))
# test = '2023年1-12月'
# print(get_period_type('上年度本期费用化研发投入'))
# print(get_period_type('费用化研发投入本年度'))
# vector_a = embed_with_str('第一季度营业收入')
# vector = vector_a.output["embeddings"][0]["embedding"]
# vector_b = embed_with_str('营业收入第一季度')
# vector1 = vector_b.output["embeddings"][0]["embedding"]
# similarity = cosine_similarity(vector, vector1)
# print(f"余弦相似度: {similarity}")
# measure_data = [
# '1,1,营业收入2023年金额,1003535799.51',
# '1,1,营业收入2022年金额,869401513.71',
# '1,1,营业收入变动比例,15.43%',
# '1,1,营业成本2023年金额,810779075.89',
# '1,1,营业成本2023年占营业收入的比重,80.79%',
# '1,1,营业成本2022年金额,702990363.57',
# '1,1,营业成本2022年占营业收入的比重,80.86%',
# '1,1,营业成本变动比例,15.33%',
# '1,1,毛利率2023年金额,19.21%',
# '1,1,毛利率2022年金额,19.14%',
# '1,1,销售费用2023年金额,34065464.60',
# '1,1,销售费用2023年占营业收入的比重,3.39%',
# '1,1,销售费用2022年金额,28038106.19',
# '1,1,销售费用2022年占营业收入的比重,3.22%',
# '1,1,销售费用变动比例,21.50%',
# '1,1,管理费用2023年金额,50807308.69',
# '1,1,管理费用2023年占营业收入的比重,5.06%',
# '1,1,管理费用2022年金额,38251704.48',
# '1,1,管理费用2022年占营业收入的比重,4.40%',
# '1,1,管理费用变动比例,32.82%',
# '1,1,研发费用2023年金额,35312198.23',
# '1,1,研发费用2023年占营业收入的比重,3.52%',
# '1,1,研发费用2022年金额,30081787.99',
# '1,1,研发费用2022年占营业收入的比重,3.46%',
# '1,1,研发费用变动比例,17.39%',
# '1,1,财务费用2023年金额,8015604.52',
# '1,1,财务费用2023年占营业收入的比重,0.80%',
# '1,1,财务费用2022年金额,5739677.85',
# '1,1,财务费用2022年占营业收入的比重,0.66%',
# '1,1,财务费用变动比例,39.65%',
# '1,1,信用减值损失2023年金额,-11873626.82',
# '1,1,信用减值损失2023年占营业收入的比重,-1.18%',
# '1,1,信用减值损失2022年金额,-8903293.61',
# '1,1,信用减值损失2022年占营业收入的比重,-1.02%',
# '1,1,信用减值损失变动比例,33.36%',
# '1,1,资产减值损失2023年金额,-2328729.46',
# '1,1,资产减值损失2023年占营业收入的比重,-0.23%',
# '1,1,资产减值损失2022年金额,-2285987.53',
# '1,1,资产减值损失2022年占营业收入的比重,-0.26%',
# '1,1,资产减值损失变动比例,1.87%',
# '1,1,其他收益2023年金额,17886048.88',
# '1,1,其他收益2023年占营业收入的比重,1.78%',
# '1,1,其他收益2022年金额,11025908.32',
# '1,1,其他收益2022年占营业收入的比重,1.27%',
# '1,1,其他收益变动比例,62.22%',
# '1,1,投资收益2023年金额,323361.47',
# '1,1,投资收益2023年占营业收入的比重,0.03%',
# '1,1,投资收益2022年金额,1119730.43',
# '1,1,投资收益2022年占营业收入的比重,0.13%',
# '1,1,投资收益变动比例,-71.12%',
# '1,1,公允价值变动收益2023年占营业收入的比重,0.00%',
# '1,1,公允价值变动收益2022年金额,10183.62',
# '1,1,公允价值变动收益2022年占营业收入的比重,0.00%',
# '1,1,公允价值变动收益变动比例,-100.00%',
# '1,1,资产处置收益2023年金额,12782544.48',
# '1,1,资产处置收益2023年占营业收入的比重,1.27%',
# '1,1,资产处置收益2022年金额,-59.56',
# '1,1,资产处置收益2022年占营业收入的比重,0.00%',
# '1,1,资产处置收益变动比例,21461726.06%',
# '1,1,汇兑收益2023年金额,0',
# '1,1,汇兑收益2023年占营业收入的比重,0%',
# '1,1,汇兑收益2022年金额,0',
# '1,1,汇兑收益2022年占营业收入的比重,0%',
# '1,1,汇兑收益变动比例,0%',
# '1,1,营业利润2023年金额,76175407.00',
# '1,1,营业利润2023年占营业收入的比重,7.59%',
# '1,1,营业利润2022年金额,63332601.81',
# '1,1,营业利润2022年占营业收入的比重,7.28%',
# '1,1,营业利润变动比例,20.28%',
# '1,1,营业外收入2023年金额,5788307.99',
# '1,1,营业外收入2023年占营业收入的比重,0.58%',
# '1,1,营业外收入2022年金额,1083997.19',
# '1,1,营业外收入2022年占营业收入的比重,0.12%',
# '1,1,营业外收入变动比例,433.98%',
# '1,1,营业外支出2023年金额,687271.68',
# '1,1,营业外支出2023年占营业收入的比重,0.07%',
# '1,1,营业外支出2022年金额,1554243.54',
# '1,1,营业外支出2022年占营业收入的比重,0.18%',
# '1,1,营业外支出变动比例,-55.78%',
# '1,1,净利润2023年金额,72975283.09',
# '1,1,净利润2023年占营业收入的比重,7.27%',
# '1,1,净利润2022年金额,57747603.98',
# '1,1,净利润2022年占营业收入的比重,6.64%',
# '1,1,净利润变动比例,26.37%',
# '1,1,税金及附加2023年金额,5170339.13',
# '1,1,税金及附加2023年占营业收入的比重,0.52%',
# '1,1,税金及附加2022年金额,1933753.49',
# '1,1,税金及附加2022年占营业收入的比重,0.22%',
# '1,1,税金及附加变动比例,167.37%',
# '1,1,所得税费用2023年金额,8301160.22',
# '1,1,所得税费用2023年占营业收入的比重,0.83%',
# '1,1,所得税费用2022年金额,5114751.48',
# '1,1,所得税费用2022年占营业收入的比重,0.59%',
# '1,1,所得税费用变动比例,62.30%',
# '1,1,少数股东损益2023年金额,-58350.22',
# '1,1,少数股东损益2023年占营业收入的比重,-0.01%',
# '1,1,少数股东损益2022年金额,-946.60',
# '1,1,少数股东损益2022年占营业收入的比重,0.00%',
# '1,1,少数股东损益变动比例,-6064.19%',
# '1,1,归属于母公司所有者的净利润2023年金额,73033633.31',
# '1,1,归属于母公司所有者的净利润2023年占营业收入的比重,7.28%',
# '1,1,归属于母公司所有者的净利润2022年金额,57748550.58',
# '1,1,归属于母公司所有者的净利润2022年占营业收入的比重,6.64%',
# '1,1,归属于母公司所有者的净利润变动比例,26.47%',
# '1,1,归属于少数股东的综合收益总额2023年金额,-58350.22',
# '1,1,归属于少数股东的综合收益总额2023年占营业收入的比重,-0.01%',
# '1,1,归属于少数股东的综合收益总额2022年金额,-946.60',
# '1,1,归属于少数股东的综合收益总额2022年占营业收入的比重,0.00%',
# '1,1,归属于少数股东的综合收益总额变动比例,-6064.19%',
# '1,1,归属于母公司所有者的综合收益总额2023年金额,73033633.31',
# '1,1,归属于母公司所有者的综合收益总额2023年占营业收入的比重,7.28%',
# '1,1,归属于母公司所有者的综合收益总额2022年金额,57748550.58',
# '1,1,归属于母公司所有者的综合收益总额2022年占营业收入的比重,6.64%',
# '1,1,归属于母公司所有者的综合收益总额变动比例,26.47%',
# '2,1,主营业务收入2023年,983698831.48',
# '2,1,主营业务收入2022年,854682261.31',
# '2,1,主营业务收入变动比例,15.10%',
# '2,1,其他业务收入2023年,19836968.03',
# '2,1,其他业务收入2022年,14719252.40',
# '2,1,其他业务收入变动比例,34.77%',
# '2,1,主营业务成本2023年,793604607.43',
# '2,1,主营业务成本2022年,690932741.27',
# '2,1,主营业务成本变动比例,14.86%',
# '2,1,其他业务成本2023年,17174468.46',
# '2,1,其他业务成本2022年,12057622.30',
# '2,1,其他业务成本变动比例,42.44%',
# '3,1,变压器营业收入,490028234.05',
# '3,1,变压器营业成本,402179824.08',
# '3,1,变压器毛利率,17.93%',
# '3,1,变压器营业收入比上年同期增减,16.22%',
# '3,1,变压器营业成本比上年同期增减,16.33%',
# '3,1,变压器毛利率比上年同期增减,减少0.07个百分点',
# '3,1,高低压成套开关设备营业收入,261342442.26',
# '3,1,高低压成套开关设备营业成本,206645237.99',
# '3,1,高低压成套开关设备毛利率,20.93%',
# '3,1,高低压成套开关设备营业收入比上年同期增减,-8.93%',
# '3,1,高低压成套开关设备营业成本比上年同期增减,-9.91%',
# '3,1,高低压成套开关设备毛利率比上年同期增减,增加0.86个百分点',
# '3,1,户外成套设备营业收入,198013248.27',
# '3,1,户外成套设备营业成本,157856817.84',
# '3,1,户外成套设备毛利率,20.28%',
# '3,1,户外成套设备营业收入比上年同期增减,62.25%',
# '3,1,户外成套设备营业成本比上年同期增减,65.30%',
# '3,1,户外成套设备毛利率比上年同期增减,减少1.47个百分点',
# '3,1,其他营业收入,54151874.93',
# '3,1,其他营业成本,44097195.98',
# '3,1,其他毛利率,18.57%',
# '3,1,其他营业收入比上年同期增减,39.68%',
# '3,1,其他营业成本比上年同期增减,36.10%',
# '3,1,其他毛利率比上年同期增减,增加2.14个百分点',
# '3,1,合计营业收入,1003535799.51',
# '3,1,合计营业成本,810779075.89',
# '3,2,东北地区营业收入,2425280.53',
# '3,2,东北地区营业成本,1427939.37',
# '3,2,东北地区毛利率,41.12%',
# '3,2,东北地区营业收入比上年同期增减,-69.51%',
# '3,2,东北地区营业成本比上年同期增减,-77.58%',
# '3,2,东北地区毛利率比上年同期增减,增加21.20个百分点',
# '3,2,华北地区营业收入,70542020.62',
# '3,2,华北地区营业成本,53044055.18',
# '3,2,华北地区毛利率,24.81%',
# '3,2,华北地区营业收入比上年同期增减,205.32%',
# '3,2,华北地区营业成本比上年同期增减,203.18%',
# '3,2,华北地区毛利率比上年同期增减,增加0.54个百分点',
# '3,2,华东地区营业收入,770352353.33',
# '3,2,华东地区营业成本,636803535.34',
# '3,2,华东地区毛利率,17.34%',
# '3,2,华东地区营业收入比上年同期增减,24.17%',
# '3,2,华东地区营业成本比上年同期增减,25.30%',
# '3,2,华东地区毛利率比上年同期增减,减少0.74个百分点',
# '3,2,华南地区营业收入,18509519.71',
# '3,2,华南地区营业成本,14496855.46',
# '3,2,华南地区毛利率,21.68%',
# '3,2,华南地区营业收入比上年同期增减,-57.08%',
# '3,2,华南地区营业成本比上年同期增减,-57.98%',
# '3,2,华南地区毛利率比上年同期增减,增加1.67个百分点',
# '3,2,华中地区营业收入,60588394.64',
# '3,2,华中地区营业成本,44559969.21',
# '3,2,华中地区毛利率,26.45%',
# '3,2,华中地区营业收入比上年同期增减,-51.24%',
# '3,2,华中地区营业成本比上年同期增减,-55.13%',
# '3,2,华中地区毛利率比上年同期增减,增加6.38个百分点',
# '3,2,西北地区营业收入,58618014.32',
# '3,2,西北地区营业成本,42844719.81',
# '3,2,西北地区毛利率,26.91%',
# '3,2,西北地区营业收入比上年同期增减,178.59%',
# '3,2,西北地区营业成本比上年同期增减,173.62%',
# '3,2,西北地区毛利率比上年同期增减,增加1.33个百分点',
# '3,2,西南地区营业收入,22500216.36',
# '3,2,西南地区营业成本,17602001.52',
# '3,2,西南地区毛利率,21.77%',
# '3,2,西南地区营业收入比上年同期增减,-23.74%',
# '3,2,西南地区营业成本比上年同期增减,-17.89%',
# '3,2,西南地区毛利率比上年同期增减,减少5.57个百分点',
# '3,2,合计营业收入,1003535799.51',
# '3,2,合计营业成本,810779075.89',
# '5,2,经营活动产生的现金流量净额2023年,-44713443.44',
# '5,2,经营活动产生的现金流量净额2022年,-53241071.45',
# '5,2,经营活动产生的现金流量净额变动比例,16.02%',
# '5,2,投资活动产生的现金流量净额2023年,-88649920.50',
# '5,2,投资活动产生的现金流量净额2022年,-94251741.15',
# '5,2,投资活动产生的现金流量净额变动比例,5.94%',
# '5,2,筹资活动产生的现金流量净额2023年,96607197.26',
# '5,2,筹资活动产生的现金流量净额2022年,210537586.22',
# '5,2,筹资活动产生的现金流量净额变动比例,-54.11%'
# ]
# client = MilvusClient(
# uri="http://localhost:19530"
# )
# vector_obj = embed_with_str('2023年营业收入')
# vector = vector_obj.output["embeddings"][0]["embedding"]
# data = [vector]
# res = client.search(
# collection_name="zzb_measure", # Replace with the actual name of your collection
# # Replace with your query vector
# data=data,
# limit=1, # Max. number of search results to return
# search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
# output_fields=["measure_name","measure_value"]
# )
# # Convert the output to a formatted JSON string
# result = json.dumps(res, indent=4, ensure_ascii=False)
# print(result)
# insert_measure_data(client, measure_data)
# text = '营业收入第一季度1-3月份'
# new_text = re.sub(r'[^)]*', '',text)
# print(new_text)

View File

@ -0,0 +1,16 @@
import re
def get_parent_table_pages(title_array, file_id):
# Collect the indexes of sections whose titles indicate tables to skip (parent-company statements, supplementary notes, etc.)
parent_table_pages = []
print(f'{file_id}:{len(title_array)}')
for i in range(len(title_array)):
title_obj = title_array[i]
title = title_obj['data']
if len(re.findall('母公司|现金流量表补充|重要会计政策|会计估计变更|公允价值的披露|合营安排或联营企业中的权益|与金融工具相关的风险|税项|主要控股参股公司|结构化主体情况|公司股份总数及股东结构变动及公司资产和负债结构的变动情况|所有权或使用权受到限制的资产|在建工程|固定资产|其他主体中的权益|分部信息|与金融工具相关的风险|其他关联交易|公司子公司重大事项', title)) > 0:
page_num = title_obj['index']
parent_table_pages.append(page_num)
return parent_table_pages

View File

@ -0,0 +1,39 @@
import logging
import logging.handlers
import os
# Create the log folder if it does not exist
log_dir = "log-day" # name of the folder that holds the log files
log_path = os.getcwd() + os.sep + log_dir
if not os.path.isdir(log_path):
os.makedirs(log_path)
# Initialize the logging module
logging.basicConfig()
# Set up the application logger
applog = logging.getLogger(__name__)
applog.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
# Add a TimedRotatingFileHandler:
# roll over to a new log file once per day,
# keeping the 3 most recent old files
timefilehandler = logging.handlers.TimedRotatingFileHandler(
log_dir + os.sep + "sec.log",
when='D',
interval=1,
backupCount=3
)
# Suffix for rotated files, in strftime format
timefilehandler.suffix = "%Y-%m-%d_%H-%M-%S.log"
# timefilehandler.suffix = "%Y-%m-%d.log"
formatter = logging.Formatter('%(asctime)s|%(name)-12s: %(levelname)-8s %(message)s')
console_handler.setFormatter(formatter)
timefilehandler.setFormatter(formatter)
applog.addHandler(timefilehandler)
applog.addHandler(console_handler)
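# Usage from other modules (module name assumed, since the file name is not shown in this diff):
# from <log_module> import applog
# applog.info('parsing started')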