Compare commits
1 Commits
pdf-dsw-20
...
main
Author | SHA1 | Date |
---|---|---|
|
a3d5164123 |
|
@ -0,0 +1,8 @@
|
|||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# Editor-based HTTP Client requests
|
||||
/httpRequests/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
|
@ -0,0 +1,6 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Encoding">
|
||||
<file url="file://$PROJECT_DIR$/zzb_data_word/log-day/sec.log" charset="GBK" />
|
||||
</component>
|
||||
</project>
|
|
@ -0,0 +1,169 @@
|
|||
<component name="InspectionProjectProfileManager">
|
||||
<profile version="1.0">
|
||||
<option name="myName" value="Project Default" />
|
||||
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
||||
<option name="ignoredPackages">
|
||||
<value>
|
||||
<list size="156">
|
||||
<item index="0" class="java.lang.String" itemvalue="pandas" />
|
||||
<item index="1" class="java.lang.String" itemvalue="protobuf" />
|
||||
<item index="2" class="java.lang.String" itemvalue="decorator" />
|
||||
<item index="3" class="java.lang.String" itemvalue="TA-Lib" />
|
||||
<item index="4" class="java.lang.String" itemvalue="websocket-client" />
|
||||
<item index="5" class="java.lang.String" itemvalue="altgraph" />
|
||||
<item index="6" class="java.lang.String" itemvalue="tzlocal" />
|
||||
<item index="7" class="java.lang.String" itemvalue="Babel" />
|
||||
<item index="8" class="java.lang.String" itemvalue="testpath" />
|
||||
<item index="9" class="java.lang.String" itemvalue="pickleshare" />
|
||||
<item index="10" class="java.lang.String" itemvalue="psycopg2" />
|
||||
<item index="11" class="java.lang.String" itemvalue="defusedxml" />
|
||||
<item index="12" class="java.lang.String" itemvalue="lml" />
|
||||
<item index="13" class="java.lang.String" itemvalue="PyQt5-sip" />
|
||||
<item index="14" class="java.lang.String" itemvalue="javascripthon" />
|
||||
<item index="15" class="java.lang.String" itemvalue="ipython-genutils" />
|
||||
<item index="16" class="java.lang.String" itemvalue="tables" />
|
||||
<item index="17" class="java.lang.String" itemvalue="rqdatac" />
|
||||
<item index="18" class="java.lang.String" itemvalue="Pygments" />
|
||||
<item index="19" class="java.lang.String" itemvalue="PyQt5" />
|
||||
<item index="20" class="java.lang.String" itemvalue="bleach" />
|
||||
<item index="21" class="java.lang.String" itemvalue="graphviz" />
|
||||
<item index="22" class="java.lang.String" itemvalue="jsonschema" />
|
||||
<item index="23" class="java.lang.String" itemvalue="pywin32" />
|
||||
<item index="24" class="java.lang.String" itemvalue="qtconsole" />
|
||||
<item index="25" class="java.lang.String" itemvalue="terminado" />
|
||||
<item index="26" class="java.lang.String" itemvalue="portalocker" />
|
||||
<item index="27" class="java.lang.String" itemvalue="Werkzeug" />
|
||||
<item index="28" class="java.lang.String" itemvalue="aniso8601" />
|
||||
<item index="29" class="java.lang.String" itemvalue="mxnet" />
|
||||
<item index="30" class="java.lang.String" itemvalue="jupyter-client" />
|
||||
<item index="31" class="java.lang.String" itemvalue="QDarkStyle" />
|
||||
<item index="32" class="java.lang.String" itemvalue="ipykernel" />
|
||||
<item index="33" class="java.lang.String" itemvalue="nbconvert" />
|
||||
<item index="34" class="java.lang.String" itemvalue="attrs" />
|
||||
<item index="35" class="java.lang.String" itemvalue="pefile" />
|
||||
<item index="36" class="java.lang.String" itemvalue="psutil" />
|
||||
<item index="37" class="java.lang.String" itemvalue="pyinstaller-hooks-contrib" />
|
||||
<item index="38" class="java.lang.String" itemvalue="PyQtWebEngine" />
|
||||
<item index="39" class="java.lang.String" itemvalue="simplejson" />
|
||||
<item index="40" class="java.lang.String" itemvalue="prettytable" />
|
||||
<item index="41" class="java.lang.String" itemvalue="jedi" />
|
||||
<item index="42" class="java.lang.String" itemvalue="helpdev" />
|
||||
<item index="43" class="java.lang.String" itemvalue="pyqtgraph" />
|
||||
<item index="44" class="java.lang.String" itemvalue="dukpy" />
|
||||
<item index="45" class="java.lang.String" itemvalue="futu-api" />
|
||||
<item index="46" class="java.lang.String" itemvalue="matplotlib" />
|
||||
<item index="47" class="java.lang.String" itemvalue="humanize" />
|
||||
<item index="48" class="java.lang.String" itemvalue="PyMySQL" />
|
||||
<item index="49" class="java.lang.String" itemvalue="msgpack" />
|
||||
<item index="50" class="java.lang.String" itemvalue="idna" />
|
||||
<item index="51" class="java.lang.String" itemvalue="rsa" />
|
||||
<item index="52" class="java.lang.String" itemvalue="vnstation" />
|
||||
<item index="53" class="java.lang.String" itemvalue="pandocfilters" />
|
||||
<item index="54" class="java.lang.String" itemvalue="numpy" />
|
||||
<item index="55" class="java.lang.String" itemvalue="pyasn1" />
|
||||
<item index="56" class="java.lang.String" itemvalue="requests" />
|
||||
<item index="57" class="java.lang.String" itemvalue="pyrsistent" />
|
||||
<item index="58" class="java.lang.String" itemvalue="gluoncv" />
|
||||
<item index="59" class="java.lang.String" itemvalue="jdcal" />
|
||||
<item index="60" class="java.lang.String" itemvalue="jupyter" />
|
||||
<item index="61" class="java.lang.String" itemvalue="seaborn" />
|
||||
<item index="62" class="java.lang.String" itemvalue="zipp" />
|
||||
<item index="63" class="java.lang.String" itemvalue="prompt-toolkit" />
|
||||
<item index="64" class="java.lang.String" itemvalue="tigeropen" />
|
||||
<item index="65" class="java.lang.String" itemvalue="itsdangerous" />
|
||||
<item index="66" class="java.lang.String" itemvalue="pyee" />
|
||||
<item index="67" class="java.lang.String" itemvalue="deap" />
|
||||
<item index="68" class="java.lang.String" itemvalue="websockets" />
|
||||
<item index="69" class="java.lang.String" itemvalue="ipywidgets" />
|
||||
<item index="70" class="java.lang.String" itemvalue="scipy" />
|
||||
<item index="71" class="java.lang.String" itemvalue="tornado" />
|
||||
<item index="72" class="java.lang.String" itemvalue="pyppeteer" />
|
||||
<item index="73" class="java.lang.String" itemvalue="Send2Trash" />
|
||||
<item index="74" class="java.lang.String" itemvalue="et-xmlfile" />
|
||||
<item index="75" class="java.lang.String" itemvalue="incremental" />
|
||||
<item index="76" class="java.lang.String" itemvalue="mistune" />
|
||||
<item index="77" class="java.lang.String" itemvalue="cnocr" />
|
||||
<item index="78" class="java.lang.String" itemvalue="future" />
|
||||
<item index="79" class="java.lang.String" itemvalue="mpmath" />
|
||||
<item index="80" class="java.lang.String" itemvalue="jupyter-console" />
|
||||
<item index="81" class="java.lang.String" itemvalue="macropy3" />
|
||||
<item index="82" class="java.lang.String" itemvalue="pycryptodome" />
|
||||
<item index="83" class="java.lang.String" itemvalue="pytz" />
|
||||
<item index="84" class="java.lang.String" itemvalue="setproctitle" />
|
||||
<item index="85" class="java.lang.String" itemvalue="webencodings" />
|
||||
<item index="86" class="java.lang.String" itemvalue="Pillow" />
|
||||
<item index="87" class="java.lang.String" itemvalue="Twisted" />
|
||||
<item index="88" class="java.lang.String" itemvalue="traitlets" />
|
||||
<item index="89" class="java.lang.String" itemvalue="Automat" />
|
||||
<item index="90" class="java.lang.String" itemvalue="pywinpty" />
|
||||
<item index="91" class="java.lang.String" itemvalue="python-dateutil" />
|
||||
<item index="92" class="java.lang.String" itemvalue="Brotli" />
|
||||
<item index="93" class="java.lang.String" itemvalue="Click" />
|
||||
<item index="94" class="java.lang.String" itemvalue="cycler" />
|
||||
<item index="95" class="java.lang.String" itemvalue="MarkupSafe" />
|
||||
<item index="96" class="java.lang.String" itemvalue="twisted-iocpsupport" />
|
||||
<item index="97" class="java.lang.String" itemvalue="constantly" />
|
||||
<item index="98" class="java.lang.String" itemvalue="mongoengine" />
|
||||
<item index="99" class="java.lang.String" itemvalue="appdirs" />
|
||||
<item index="100" class="java.lang.String" itemvalue="docopt" />
|
||||
<item index="101" class="java.lang.String" itemvalue="ibapi" />
|
||||
<item index="102" class="java.lang.String" itemvalue="pymssql" />
|
||||
<item index="103" class="java.lang.String" itemvalue="pyzmq" />
|
||||
<item index="104" class="java.lang.String" itemvalue="certifi" />
|
||||
<item index="105" class="java.lang.String" itemvalue="entrypoints" />
|
||||
<item index="106" class="java.lang.String" itemvalue="peewee" />
|
||||
<item index="107" class="java.lang.String" itemvalue="pyparsing" />
|
||||
<item index="108" class="java.lang.String" itemvalue="sympy" />
|
||||
<item index="109" class="java.lang.String" itemvalue="notebook" />
|
||||
<item index="110" class="java.lang.String" itemvalue="hyperlink" />
|
||||
<item index="111" class="java.lang.String" itemvalue="win-unicode-console" />
|
||||
<item index="112" class="java.lang.String" itemvalue="kiwisolver" />
|
||||
<item index="113" class="java.lang.String" itemvalue="zope.interface" />
|
||||
<item index="114" class="java.lang.String" itemvalue="APScheduler" />
|
||||
<item index="115" class="java.lang.String" itemvalue="backcall" />
|
||||
<item index="116" class="java.lang.String" itemvalue="PySocks" />
|
||||
<item index="117" class="java.lang.String" itemvalue="widgetsnbextension" />
|
||||
<item index="118" class="java.lang.String" itemvalue="numexpr" />
|
||||
<item index="119" class="java.lang.String" itemvalue="pyecharts-snapshot" />
|
||||
<item index="120" class="java.lang.String" itemvalue="jupyter-core" />
|
||||
<item index="121" class="java.lang.String" itemvalue="pyecharts-jupyter-installer" />
|
||||
<item index="122" class="java.lang.String" itemvalue="Delorean" />
|
||||
<item index="123" class="java.lang.String" itemvalue="SQLAlchemy" />
|
||||
<item index="124" class="java.lang.String" itemvalue="wcwidth" />
|
||||
<item index="125" class="java.lang.String" itemvalue="importlib-metadata" />
|
||||
<item index="126" class="java.lang.String" itemvalue="Jinja2" />
|
||||
<item index="127" class="java.lang.String" itemvalue="simplegeneric" />
|
||||
<item index="128" class="java.lang.String" itemvalue="stomp.py" />
|
||||
<item index="129" class="java.lang.String" itemvalue="pywin32-ctypes" />
|
||||
<item index="130" class="java.lang.String" itemvalue="pyecharts" />
|
||||
<item index="131" class="java.lang.String" itemvalue="urllib3" />
|
||||
<item index="132" class="java.lang.String" itemvalue="Flask" />
|
||||
<item index="133" class="java.lang.String" itemvalue="coverage" />
|
||||
<item index="134" class="java.lang.String" itemvalue="pyinstaller" />
|
||||
<item index="135" class="java.lang.String" itemvalue="pymongo" />
|
||||
<item index="136" class="java.lang.String" itemvalue="six" />
|
||||
<item index="137" class="java.lang.String" itemvalue="parso" />
|
||||
<item index="138" class="java.lang.String" itemvalue="pytesseract" />
|
||||
<item index="139" class="java.lang.String" itemvalue="nbformat" />
|
||||
<item index="140" class="java.lang.String" itemvalue="ipython" />
|
||||
<item index="141" class="java.lang.String" itemvalue="jqdatasdk" />
|
||||
<item index="142" class="java.lang.String" itemvalue="python-rapidjson" />
|
||||
<item index="143" class="java.lang.String" itemvalue="packaging" />
|
||||
<item index="144" class="java.lang.String" itemvalue="pyecharts-javascripthon" />
|
||||
<item index="145" class="java.lang.String" itemvalue="prometheus-client" />
|
||||
<item index="146" class="java.lang.String" itemvalue="jupyter-echarts-pypkg" />
|
||||
<item index="147" class="java.lang.String" itemvalue="chardet" />
|
||||
<item index="148" class="java.lang.String" itemvalue="tqdm" />
|
||||
<item index="149" class="java.lang.String" itemvalue="thriftpy2" />
|
||||
<item index="150" class="java.lang.String" itemvalue="colorama" />
|
||||
<item index="151" class="java.lang.String" itemvalue="vnpy" />
|
||||
<item index="152" class="java.lang.String" itemvalue="ply" />
|
||||
<item index="153" class="java.lang.String" itemvalue="Flask-RESTful" />
|
||||
<item index="154" class="java.lang.String" itemvalue="openpyxl" />
|
||||
<item index="155" class="java.lang.String" itemvalue="python-docx" />
|
||||
</list>
|
||||
</value>
|
||||
</option>
|
||||
</inspection_tool>
|
||||
</profile>
|
||||
</component>
|
|
@ -0,0 +1,6 @@
|
|||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
|
@ -0,0 +1,7 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.8" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
|
||||
</project>
|
|
@ -0,0 +1,8 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/pdfcode.iml" filepath="$PROJECT_DIR$/.idea/pdfcode.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
|
@ -0,0 +1,12 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.8" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="PyDocumentationSettings">
|
||||
<option name="format" value="PLAIN" />
|
||||
<option name="myDocStringFormat" value="Plain" />
|
||||
</component>
|
||||
</module>
|
|
@ -0,0 +1,7 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="" vcs="Git" />
|
||||
<mapping directory="$PROJECT_DIR$/zzb_data_prod" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
|
@ -0,0 +1,125 @@
|
|||
import socket
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import datetime
|
||||
import os
|
||||
import mysql.connector
|
||||
from zzb_data_prod.config import MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB
|
||||
|
||||
def get_time():
    """Current local time formatted as 'YYYY-MM-DD HH:MM:SS' for log prefixes."""
    return f"{datetime.now():%Y-%m-%d %H:%M:%S}"
|
||||
|
||||
|
||||
def check_port(host, port):
    """Return True when a TCP connection to (host, port) succeeds within 5 seconds.

    Any exception (DNS failure, invalid host, ...) is logged and reported as
    the port being unavailable (False).
    """
    try:
        # Context manager guarantees the socket is closed even when
        # connect_ex raises — the original leaked the fd on that path.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(5)
            return sock.connect_ex((host, port)) == 0
    except Exception as e:
        print(f"[{get_time()}] 端口检测异常: {str(e)}")
        return False
|
||||
|
||||
|
||||
def restart_service():
    """Restart the Milvus standalone service via its management script.

    Returns True when the script exits 0, False otherwise.
    """
    try:
        # BUG FIX: check=True is required — without it subprocess.run never
        # raises CalledProcessError, so the except branch below was dead code
        # and a failed restart was always reported as success.
        subprocess.run(
            ["bash", "/root/docker/milvus/standalone_embed.sh", "restart"],
            check=True,
        )
        print(f"[{get_time()}] milvus服务重启成功")
        return True
    except subprocess.CalledProcessError as e:
        print(f"[{get_time()}] 服务重启失败: {str(e)}")
        return False
|
||||
|
||||
|
||||
def start_application_process():
    """Start the application process that serves port 8000.

    Delegates to restart_app.sh (which kills any old process and launches a
    new one). Returns True on success, False on any failure.
    """
    try:
        # Give a previously stopped process time to release the port.
        time.sleep(2)
        # BUG FIX: check=True so a non-zero exit from the restart script is
        # treated as a failure instead of silently returning True.
        subprocess.run(
            ["bash", "/root/pdf_parser/restart_app.sh"],
            check=True,
        )
        print(f"[{get_time()}] 应用进程(8000端口)已成功启动")
        return True

    except Exception as e:
        print(f"[{get_time()}] 启动应用进程失败: {str(e)}")
        return False
|
||||
|
||||
|
||||
def get_local_ip():
    """Best-effort discovery of this host's LAN IP address.

    "Connects" a UDP socket to a public address (8.8.8.8) — no packets are
    actually sent for a UDP connect — and reads back the local address the
    OS would route through. Falls back to 127.0.0.1 on any failure.
    """
    s = None
    try:
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        # UDP connect only selects a route; nothing is transmitted.
        s.connect(("8.8.8.8", 80))
        local_ip = s.getsockname()[0]
    except Exception as e:
        print(f"[{get_time()}] 获取内网 IP 失败: {e}")
        local_ip = "127.0.0.1"
    finally:
        # BUG FIX: `s` was unbound here when socket() itself raised,
        # turning the cleanup into a NameError.
        if s is not None:
            s.close()
    return local_ip
|
||||
|
||||
def monitor_port_8000():
    """Check port 8000 and (re)start the application process when it is down.

    After a successful restart, mark this host's endpoint as available
    (status = 0) in the `model_ip` table so dispatchers can route to it again.
    All outcomes are reported via timestamped prints.
    """
    print(f"[{get_time()}] 检查8000端口状态...")
    port_available = check_port("127.0.0.1", 8000)

    if not port_available:
        print(f"[{get_time()}] 检测到8000端口异常,尝试启动应用进程...")
        success = start_application_process()

        if success:
            # Give the application time to boot before re-probing the port.
            time.sleep(10)
            if check_port("127.0.0.1", 8000):
                print(f"[{get_time()}] 应用进程启动成功,8000端口已正常")
                # Flip this node back to "available" in the database.
                conn = None
                cursor = None
                try:
                    conn = mysql.connector.connect(
                        host=MYSQL_HOST,
                        user=MYSQL_USER,
                        password=MYSQL_PASSWORD,
                        database=MYSQL_DB
                    )
                    cursor = conn.cursor()
                    local_ip = get_local_ip()
                    # Parameterized query instead of f-string interpolation —
                    # never build SQL by string formatting.
                    sql = "update model_ip set status = 0 where ip = %s"
                    print(f"[{get_time()}] 执行sql: {sql}")
                    cursor.execute(sql, (f"{local_ip}:8000",))
                    conn.commit()
                    print(f"[{get_time()}] 数据库字段已成功修改")
                except Exception as e:
                    print(f"[{get_time()}] 修改数据库字段失败: {str(e)}")
                finally:
                    # Close only what was actually opened (the original relied
                    # on a bare `except: pass` to hide unbound names here).
                    if cursor is not None:
                        try:
                            cursor.close()
                        except Exception:
                            pass
                    if conn is not None:
                        try:
                            conn.close()
                        except Exception:
                            pass
            else:
                print(f"[{get_time()}] 应用进程启动后,8000端口仍未正常")
    else:
        print(f"[{get_time()}] 8000端口状态正常")
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # One-shot health check, presumably driven by an external scheduler
    # (e.g. cron) — TODO confirm; there is no loop in this file.
    print(f"[{get_time()}] 启动Milvus监控服务")
    # 19530 is the default Milvus gRPC port.
    port_ok = check_port("127.0.0.1", 19530)
    if not port_ok:
        print("检测到Milvus服务异常,尝试重启...")
        restart_service()

    print(f"[{get_time()}] 启动 8000 端口监控服务")
    # Start monitoring port 8000. NOTE(review): the original comment claimed
    # "check every 60 seconds", but monitor_port_8000 is invoked exactly once.
    monitor_port_8000()
|
||||
|
||||
|
|
@ -0,0 +1,76 @@
|
|||
#!/bin/bash
# Restart the Milvus standalone deployment via its bundled management script.

# The script must run from its own directory; abort if we cannot get there.
if ! cd /root/docker/milvus; then
    echo "无法进入目录 /root/docker/milvus"
    exit 1
fi

# Hand off to Milvus' own restart entry point.
bash standalone_embed.sh restart
|
||||
|
||||
|
||||
#!/bin/bash
# Remove leftover PDF files from the parser's working directory,
# logging every deletion with a timestamp.

# Target directory (adjust to the actual deployment path).
TARGET_DIR="/root/pdf_parser/pdf"
LOG_FILE="/root/pdf_parser/logs/pdf_clean.log"

# Ensure the log directory exists.
mkdir -p "$(dirname "$LOG_FILE")"

# Timestamped logger: echoes to stdout and appends to the log file.
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

# Refuse to run against a missing directory.
if [ ! -d "$TARGET_DIR" ]; then
    log "错误:目标目录不存在 $TARGET_DIR"
    exit 1
fi

log "开始清理PDF文件..."
# BUG FIX: count BEFORE deleting — the original counted after `rm`,
# so the summary always reported 0 files.
deleted_count=$(find "$TARGET_DIR" -iname "*.pdf" | wc -l)
# -print0 / read -d '' handles file names containing whitespace or newlines.
find "$TARGET_DIR" -iname "*.pdf" -print0 | while IFS= read -r -d $'\0' file; do
    log "删除文件: $file"
    rm -f "$file"
done

log "清理完成,共删除 $deleted_count 个残留文件"
|
||||
|
||||
# Working directory and log path for the application service.
WORK_DIR="/root/pdf_parser/zzb_data_prod"
LOG_FILE="$WORK_DIR/app.log"

# Stop any existing app.py process before starting a new one.
pids=$(ps -ef | grep app.py | grep -v grep | awk '{print $2}')
if [ -n "$pids" ]; then
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] 正在停止现有进程: $pids"
    # $pids deliberately unquoted: it may contain several PIDs.
    kill -9 $pids
else
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] 未找到正在运行的进程"
fi

# Enter the working directory (FIX: quoted so paths with spaces don't split).
cd "$WORK_DIR" || { echo "无法进入目录 $WORK_DIR"; exit 1; }

# Launch the service detached from this shell; capture stdout+stderr.
echo "[$(date '+%Y-%m-%d %H:%M:%S')] 启动服务..."
nohup python3 app.py > "$LOG_FILE" 2>&1 &

# Give the process a moment to start (or crash).
sleep 2

# Verify it is still alive and surface the log either way.
new_pid=$(ps -ef | grep app.py | grep -v grep | awk '{print $2}')
if [ -n "$new_pid" ]; then
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] 服务启动成功,进程ID: $new_pid"
    echo "--------------------------------"
    tail -n 10 "$LOG_FILE"
else
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] 服务启动失败!"
    echo "--------------------------------"
    cat "$LOG_FILE"
    exit 1
fi
|
||||
|
||||
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,99 @@
|
|||
# Requires transformers>=4.51.0
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from torch import Tensor
|
||||
from modelscope import AutoTokenizer, AutoModel
|
||||
import datetime
|
||||
import dashscope
|
||||
from http import HTTPStatus
|
||||
|
||||
|
||||
# SECURITY: hard-coded API key committed to source control — this key is now
# public and should be revoked. Load it from an environment variable or a
# secret store instead (e.g. os.environ["DASHSCOPE_API_KEY"]).
dashscope.api_key = 'sk-f8413fcfa63a40e49dbb8ac1d0239f6d'
|
||||
|
||||
def embed_with_str(input):
    """Embed `input` via the DashScope text-embedding-v2 API with retry.

    Retries up to 5 times with a slowly growing back-off when the API
    rate-limits (HTTP 429). Returns the DashScope response object on
    success, or None on non-retryable failure / retry exhaustion.
    """
    # BUG FIX: `logger` was referenced but never defined anywhere in this
    # module, so every retry/error path raised NameError.
    import logging
    import time
    logger = logging.getLogger(__name__)

    retry = 0
    max_retry = 5
    t = 0.2  # back-off in seconds; grows by 0.1 per throttled attempt
    while retry < max_retry:
        resp = dashscope.TextEmbedding.call(
            model=dashscope.TextEmbedding.Models.text_embedding_v2,
            input=input)
        if resp.status_code == HTTPStatus.OK:
            return resp
        elif resp.status_code == 429:
            # Alibaba API rate limiting: actually wait before retrying —
            # the original logged the wait but never slept.
            logger.info(f'触发限流,等待{t}秒后重试')
            time.sleep(t)
            retry += 1
            t += 0.1
        else:
            logger.error(f'请求失败,状态码:{resp.status_code}')
            return None
    logger.error('重试超过上限')
    return None
|
||||
|
||||
def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    """Select each sequence's final valid token state from a batched output.

    If every row's last position is unmasked (left padding), the last column
    is taken directly; otherwise each row's last non-pad index is derived
    from the attention mask.
    """
    uses_left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0]
    if uses_left_padding:
        return last_hidden_states[:, -1]
    last_valid_idx = attention_mask.sum(dim=1) - 1
    row_idx = torch.arange(last_hidden_states.shape[0],
                           device=last_hidden_states.device)
    return last_hidden_states[row_idx, last_valid_idx]
|
||||
|
||||
|
||||
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Prefix a query with its task instruction in the Qwen embedding format."""
    return 'Instruct: ' + task_description + '\nQuery:' + query
|
||||
|
||||
# Each query must come with a one-sentence instruction that describes the task
task = 'Given a web search query, retrieve relevant passages that answer the query'

queries = [
    get_detailed_instruct(task, 'What is the capital of China?'),
    get_detailed_instruct(task, 'Explain gravity')
]
# No need to add instruction for retrieval documents
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
]
input_texts = queries + documents

# Left padding ensures the last token of every row is a real token,
# which is the assumption last_token_pool's fast path relies on.
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-Embedding-0.6B', padding_side='left')
model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B')

# We recommend enabling flash_attention_2 for better acceleration and memory saving.
# model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B', attn_implementation="flash_attention_2", torch_dtype=torch.float16).cuda()
# Timestamps bracket the forward pass for rough latency measurement.
print(datetime.datetime.now())
max_length = 8192

# Tokenize the input texts
batch_dict = tokenizer(
    input_texts,
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors="pt",
)
batch_dict.to(model.device)
outputs = model(**batch_dict)

# Take the hidden state of each sequence's final non-pad token as its embedding.
embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
print(f"=========embeddings=========")
print(datetime.datetime.now())

# Cosine similarity (unit vectors, so dot product) of 2 queries vs 2 documents.
scores = (embeddings[:2] @ embeddings[2:].T)
# Prints the embedding dimensionality of the local model.
print(len(embeddings.tolist()[0]))
# [[0.7645568251609802, 0.14142508804798126], [0.13549736142158508, 0.5999549627304077]]

# Compare against the remote DashScope embedding service.
# NOTE(review): embed_with_str returns None on failure, which would raise
# AttributeError on `.output` below — confirm a crash is acceptable here.
vector_obj = embed_with_str(input_texts)
vector = vector_obj.output["embeddings"][0]["embedding"]
print(len(vector))
|
|
@ -1,18 +1,21 @@
|
|||
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection,MilvusClient
|
||||
from config import MILVUS_CLIENT
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from log_config import logger
|
||||
|
||||
def create_partition_by_hour(current_hour):
|
||||
def create_partition_by_hour(file_id):
|
||||
# 连接到 Milvus 服务器
|
||||
connections.connect("default",uri=MILVUS_CLIENT)
|
||||
connections.connect(uri=MILVUS_CLIENT)
|
||||
# 获取集合
|
||||
collection_name = "pdf_measure_v4"
|
||||
collection = Collection(collection_name)
|
||||
|
||||
# 创建当前小时的分区
|
||||
partition_name = f"partition_{current_hour}"
|
||||
|
||||
# 创建当前id的分区
|
||||
partition_name = f"partition_{file_id}"
|
||||
if not collection.has_partition(partition_name):
|
||||
collection.create_partition(partition_name)
|
||||
print(f"Created partition: {partition_name}")
|
||||
logger.info(f"Created partition: {partition_name}")
|
||||
partition = collection.partition(partition_name)
|
||||
partition.load()
|
||||
|
||||
|
@ -25,44 +28,13 @@ def create_partition_by_hour(current_hour):
|
|||
pre_partition = collection.partition(name)
|
||||
pre_partition.release()
|
||||
collection.drop_partition(name)
|
||||
print(f"Partition '{name}' deleted.")
|
||||
logger.info(f"Partition '{name}' deleted.")
|
||||
connections.disconnect("default")
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# data = []
|
||||
# measure_data = {}
|
||||
# vector = [0.61865162262130161] * 1536
|
||||
# measure_data['vector'] = vector
|
||||
# measure_data['table_num'] = int(2)
|
||||
# measure_data['table_index'] = int(2)
|
||||
# measure_data['measure_name'] = "234234"
|
||||
# measure_data['measure_value'] = "23432"
|
||||
# measure_data['measure_unit'] = "123423"
|
||||
# measure_data['file_id'] = "100000"
|
||||
#
|
||||
# data.append(measure_data)
|
||||
# res = client.insert(
|
||||
# collection_name=collection_name,
|
||||
# data=data,
|
||||
# partition_name=partition_name
|
||||
# )
|
||||
|
||||
# filter_str = 'file_id == "'+"2122"+'"'
|
||||
# res = client.search(
|
||||
# collection_name=collection_name, # Replace with the actual name of your collection
|
||||
# # Replace with your query vector
|
||||
# data=data,
|
||||
# limit=3, # Max. number of search results to return
|
||||
# search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
|
||||
# output_fields=["measure_name", "measure_value", "table_num", "table_index", "measure_unit"],
|
||||
# filter=filter_str,
|
||||
# partition_name=partition_name
|
||||
# )
|
||||
# print(f"============================={res}")
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -98,4 +70,4 @@ def create_partition_by_hour(current_hour):
|
|||
# "params": {"nlist": 128}
|
||||
# }
|
||||
# collection.create_index(field_name="vector", index_params=index_params)
|
||||
# collection.load()
|
||||
# collection.load()
|
||||
|
|
|
@ -14,10 +14,10 @@ import db_service
|
|||
import threading
|
||||
from Mil_unit import create_partition_by_hour
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from log_config import logger
|
||||
|
||||
app = FastAPI()
|
||||
cpu_count = os.cpu_count()
|
||||
cpu_count = 4
|
||||
job_queue = queue.Queue()
|
||||
|
||||
# 定义请求体模型
|
||||
|
@ -26,11 +26,11 @@ class FileItem(BaseModel):
|
|||
file_id: str
|
||||
|
||||
def run_job():
|
||||
#判断是否有任务在执行
|
||||
#判断是否有任务在执行
|
||||
if_run = True
|
||||
|
||||
if job_queue.empty():
|
||||
print(f"job_queue为空: {file_path}")
|
||||
logger.info(f"job_queue为空: {file_path}")
|
||||
if_run = False
|
||||
|
||||
if if_run:
|
||||
|
@ -43,29 +43,24 @@ def run_job():
|
|||
try:
|
||||
#下载pdf
|
||||
start_time = time.time()
|
||||
print(f"开始启动文件解析任务: {file_path}")
|
||||
logger.info(f"开始启动文件解析任务: {file_path}")
|
||||
if file_path.startswith('http'):
|
||||
file_path = utils.save_pdf_from_url(file_path, config.FILE_PATH)
|
||||
try:
|
||||
file_info = pdf_title.create_text_outline(file_path,file_id)
|
||||
except Exception as e:
|
||||
response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 7})
|
||||
print(f'通知任务状态url:{file_id}:{response.url}')
|
||||
print(f'通知任务状态任务:{file_id}:{response.text}')
|
||||
print(f"{file_id}运行失败: {e}")
|
||||
logger.info(f'通知任务状态url:{file_id}:{response.url}')
|
||||
logger.info(f'通知任务状态任务:{file_id}:{response.text}')
|
||||
logger.info(f"{file_id}运行失败: {e}")
|
||||
continue_execution = False
|
||||
if continue_execution:
|
||||
print(cpu_count)
|
||||
parent_table_pages = file_info['parent_table_pages']
|
||||
print('parent_table_pages的值是')
|
||||
print(parent_table_pages)
|
||||
|
||||
# page_nums = [
|
||||
# '1-3',
|
||||
# '4-6',
|
||||
# ]
|
||||
print(cpu_count)
|
||||
print('测试')
|
||||
#
|
||||
db_service.delete_MYSQL_DB_APP(file_id)
|
||||
db_service.delete_MYSQL_DB(file_id)
|
||||
|
||||
if continue_execution:
|
||||
parent_table_pages = file_info['parent_table_pages']
|
||||
page_num = file_info['page_count']
|
||||
if page_num < cpu_count:
|
||||
p_count = page_num
|
||||
|
@ -73,7 +68,6 @@ def run_job():
|
|||
p_count = cpu_count
|
||||
|
||||
for i in range(p_count):
|
||||
# for i in range(2):
|
||||
page_list.append({
|
||||
'type': 'table',
|
||||
'page_num': file_info['split_parts']['table_split_parts'][i],
|
||||
|
@ -88,8 +82,8 @@ def run_job():
|
|||
|
||||
# 通知开始解析
|
||||
response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 5})
|
||||
print(f'通知pdf开始解析url:{file_id}:{response.url}')
|
||||
print(f'通知pdf开始解析状态:{file_id}:{response.text}')
|
||||
logger.info(f'通知pdf开始解析url:{file_id}:{response.url}')
|
||||
logger.info(f'通知pdf开始解析状态:{file_id}:{response.text}')
|
||||
parser_start_time = time.time()
|
||||
processes = []
|
||||
time_dispatch_job = time.time()
|
||||
|
@ -98,39 +92,36 @@ def run_job():
|
|||
p = Process(target=main.dispatch_job, args=(job_info,))
|
||||
processes.append(p)
|
||||
p.start()
|
||||
#time_dispatch_job_end = time.time()
|
||||
#process_time = time_dispatch_job_end - time_dispatch_job
|
||||
#db_service.process_time(file_id,'1',process_time)
|
||||
|
||||
print('等待所有子任务完成,任务ID:', file_id)
|
||||
|
||||
logger.info(f'等待所有子任务完成,任务ID:{file_id}')
|
||||
for p in processes:
|
||||
p.join()
|
||||
print('pdf解析任务完成任务完成,任务ID:', file_id)
|
||||
logger.info(f'pdf解析任务完成任务完成,任务ID:{file_id}')
|
||||
time_dispatch_job_end = time.time()
|
||||
process_time = time_dispatch_job_end - time_dispatch_job
|
||||
db_service.process_time(file_id,'1',process_time,time_dispatch_job,time_dispatch_job_end)
|
||||
parser_end_time = time.time()
|
||||
print(f"解析任务 {file_id} 完成,耗时{(parser_end_time - parser_start_time):.2f} 秒。")
|
||||
logger.info(f"解析任务 {file_id} 完成,耗时{(parser_end_time - parser_start_time):.2f} 秒。")
|
||||
#这里做一步判断,看看是否还要继续。
|
||||
if db_service.file_type_check(file_id):
|
||||
print("文本较真表格生成已结束")
|
||||
logger.info(f"文本较真表格生成已结束")
|
||||
else:
|
||||
# 通知抽取指标
|
||||
response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 6})
|
||||
print(f'通知开始抽取指标url:{file_id}:{response.url}')
|
||||
print(f'通知开始抽取指标状态:{file_id}:{response.text}')
|
||||
logger.info(f'通知开始抽取指标url:{file_id}:{response.url}')
|
||||
logger.info(f'通知开始抽取指标状态:{file_id}:{response.text}')
|
||||
|
||||
parser_start_time = time.time()
|
||||
print('开始表格指标抽取,任务ID:', file_id)
|
||||
logger.info(f'开始表格指标抽取,任务ID:{file_id}')
|
||||
time_start = time.time()
|
||||
|
||||
|
||||
# 获取当前时间
|
||||
now = datetime.now()
|
||||
current_hour = now.strftime("%Y%m%d%H")
|
||||
partition_name = f"partition_{current_hour}"
|
||||
|
||||
partition_name = f"partition_{file_id}"
|
||||
# 判断是否创建新的分区
|
||||
create_partition_by_hour(current_hour)
|
||||
create_partition_by_hour(file_id)
|
||||
time.sleep(10)
|
||||
# 判断是否为3季报
|
||||
|
||||
if db_service.file_type_check_v2(file_id) == 3:
|
||||
|
@ -138,17 +129,17 @@ def run_job():
|
|||
time_start_end = time.time()
|
||||
process_time = time_start_end - time_start
|
||||
db_service.process_time(file_id,'2',process_time,time_start,time_start_end)
|
||||
print('表格指标抽取完成,任务ID:', file_id)
|
||||
logger.info(f'表格指标抽取完成,任务ID:{file_id}')
|
||||
parser_end_time = time.time()
|
||||
print(f"表格指标抽取 {file_id} 完成,耗时{(parser_end_time - parser_start_time):.2f} 秒。")
|
||||
logger.info(f"表格指标抽取 {file_id} 完成,耗时{(parser_end_time - parser_start_time):.2f} 秒。")
|
||||
|
||||
print('启动这个指标归一化任务ID-修改测试:', file_id)
|
||||
logger.info(f'启动这个指标归一化任务ID-修改测试:{file_id}')
|
||||
time_update = time.time()
|
||||
main.update_measure_data(file_id,file_path,parent_table_pages,partition_name)
|
||||
|
||||
print('归一化完成任务ID:', file_id)
|
||||
logger.info(f'归一化完成任务ID:{file_id}')
|
||||
end_time = time.time()
|
||||
print(f"任务 {file_id} 完成,耗时{(end_time - start_time):.2f} 秒。")
|
||||
logger.info(f"任务 {file_id} 完成,耗时{(end_time - start_time):.2f} 秒。")
|
||||
time_update_end = time.time()
|
||||
process_time = time_update_end - time_update
|
||||
db_service.process_time(file_id,'3',process_time,time_update,time_update_end)
|
||||
|
@ -158,25 +149,25 @@ def run_job():
|
|||
time_start_end = time.time()
|
||||
process_time = time_start_end - time_start
|
||||
db_service.process_time(file_id,'2',process_time,time_start,time_start_end)
|
||||
print('表格指标抽取完成,任务ID:', file_id)
|
||||
logger.info(f'表格指标抽取完成,任务ID:{file_id}')
|
||||
parser_end_time = time.time()
|
||||
print(f"表格指标抽取 {file_id} 完成,耗时{(parser_end_time - parser_start_time):.2f} 秒。")
|
||||
logger.info(f"表格指标抽取 {file_id} 完成,耗时{(parser_end_time - parser_start_time):.2f} 秒。")
|
||||
|
||||
print('启动这个指标归一化任务ID-修改测试:', file_id)
|
||||
logger.info(f'启动这个指标归一化任务ID-修改测试:{file_id}')
|
||||
time_update = time.time()
|
||||
main.update_measure_data(file_id,file_path,parent_table_pages,partition_name)
|
||||
|
||||
print('归一化完成任务ID:', file_id)
|
||||
logger.info(f'归一化完成任务ID:{file_id}')
|
||||
end_time = time.time()
|
||||
print(f"任务 {file_id} 完成,耗时{(end_time - start_time):.2f} 秒。")
|
||||
logger.info(f"任务 {file_id} 完成,耗时{(end_time - start_time):.2f} 秒。")
|
||||
time_update_end = time.time()
|
||||
process_time = time_update_end - time_update
|
||||
db_service.process_time(file_id,'3',process_time,time_update,time_update_end)
|
||||
#通知任务完成
|
||||
response_time = time.time()
|
||||
response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 1})
|
||||
print(f'通知任务状态url:{file_id}:{response.url}')
|
||||
print(f'通知任务状态任务:{file_id}:{response.text}')
|
||||
logger.info(f'通知任务状态url:{file_id}:{response.url}')
|
||||
logger.info(f'通知任务状态任务:{file_id}:{response.text}')
|
||||
response_time_end = time.time()
|
||||
process_time = response_time_end - response_time
|
||||
db_service.process_time(file_id,'4',process_time,response_time,response_time_end)
|
||||
|
@ -191,17 +182,17 @@ def run_job():
|
|||
response_time_end = time.time()
|
||||
process_time = response_time_end - response_time
|
||||
db_service.process_time(file_id,'4',process_time,response_time,response_time_end)
|
||||
print(f'通知任务状态url:{file_id}:{response.url}')
|
||||
print(f'通知任务状态任务:{file_id}:{response.text}')
|
||||
print(f"Response status code: {response.status_code}")
|
||||
print(f"{file_id}运行失败: {e}")
|
||||
logger.info(f'通知任务状态url:{file_id}:{response.url}')
|
||||
logger.info(f'通知任务状态任务:{file_id}:{response.text}')
|
||||
logger.info(f"Response status code: {response.status_code}")
|
||||
logger.info(f"{file_id}运行失败: {e}")
|
||||
finally:
|
||||
print(f"任务 {file_id} 完成,运行状态:{job_status}")
|
||||
logger.info(f"任务 {file_id} 完成,运行状态:{job_status}")
|
||||
|
||||
#pdf_company_0824.name_code_fix(file_id,file_path)
|
||||
#print('公司名与编码填充完毕')
|
||||
else:
|
||||
print("有任务运行中,需要等待.....")
|
||||
logger.info(f"有任务运行中,需要等待.....")
|
||||
|
||||
def parse_pdf_route(fileItem: FileItem):
|
||||
|
||||
|
@ -210,7 +201,7 @@ def parse_pdf_route(fileItem: FileItem):
|
|||
'file_path' : fileItem.file_path,
|
||||
'file_id' : fileItem.file_id
|
||||
})
|
||||
print(f"增加 {fileItem.file_id} 到队列.")
|
||||
logger.info(f"增加 {fileItem.file_id} 到队列.")
|
||||
|
||||
threading.Thread(target=run_job, args=()).start()
|
||||
|
||||
|
@ -221,16 +212,43 @@ app.post("/parser/start",
|
|||
summary="解析Pdf文件",
|
||||
)(parse_pdf_route)
|
||||
|
||||
def get_local_ip():
|
||||
try:
|
||||
# 创建一个 UDP 套接字
|
||||
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
|
||||
# 连接到一个外部地址(这里使用 Google 的公共 DNS 服务器)
|
||||
s.connect(("8.8.8.8", 80))
|
||||
# 获取本地套接字的 IP 地址
|
||||
local_ip = s.getsockname()[0]
|
||||
except Exception as e:
|
||||
logger.info(f"获取内网 IP 失败: {e}")
|
||||
local_ip = "127.0.0.1" # 如果失败,返回本地回环地址
|
||||
finally:
|
||||
s.close() # 关闭套接字
|
||||
return local_ip
|
||||
|
||||
# 运行 FastAPI 应用
|
||||
if __name__ == "__main__":
|
||||
# 服务器启动服务
|
||||
# import uvicorn
|
||||
# uvicorn.run(app, host="0.0.0.0", port=config.PORT)
|
||||
import uvicorn
|
||||
uvicorn.run(app, host="0.0.0.0", port=config.PORT)
|
||||
try:
|
||||
# 获取内网IP
|
||||
ip = get_local_ip()
|
||||
logger.info(f"内网IP地址: {ip}")
|
||||
# 假设 config.NOTIFY_ADDR 是一个字符串,我们可以使用 rpartition 方法来替换最后一个 / 后面的值
|
||||
url = config.NOTIFY_ADDR.rpartition('/')[0] + '/restart?address'
|
||||
address = f"{ip}:{config.PORT}"
|
||||
logger.info(address)
|
||||
response = requests.get(url, params={'address':address})
|
||||
logger.info(f"Response status code: {response.status_code}")
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Shutdown server")
|
||||
|
||||
# 本地调试任务
|
||||
job_queue.put({
|
||||
'file_path' : '3.pdf',
|
||||
'file_id' : '2122'
|
||||
})
|
||||
|
||||
run_job()
|
||||
# job_queue.put({
|
||||
# 'file_path' : '1.pdf',
|
||||
# 'file_id' : '2222222'
|
||||
# })
|
||||
|
||||
# run_job()
|
||||
|
|
Binary file not shown.
|
@ -1,28 +1,28 @@
|
|||
MILVUS_CLIENT='http://124.70.129.232:19530'
|
||||
#MILVUS_CLIENT='http://60.204.228.154:19530'
|
||||
MYSQL_HOST = '121.37.185.246'
|
||||
MILVUS_CLIENT='http://127.0.0.1:19530'
|
||||
MILVUS_HOST = '127.0.0.1'
|
||||
MILVUS_PORT = 19530
|
||||
MYSQL_HOST = '10.127.2.207'
|
||||
MYSQL_PORT = 3306
|
||||
MYSQL_USER = 'financial'
|
||||
MYSQL_PASSWORD = 'financial_8000'
|
||||
MYSQL_DB = 'financial_report'
|
||||
NOTIFY_ADDR = 'http://127.0.0.1:8100/api/tenant/report/notify'
|
||||
NOTIFY_ADDR_DIS = 'http://127.0.0.1:8100/api/tenant/info/notify'
|
||||
REDIS_HOST = '123.60.153.169'
|
||||
MYSQL_USER = 'financial_prod'
|
||||
MYSQL_PASSWORD = 'mmTFncqmDal5HLRGY0BV'
|
||||
MYSQL_DB = 'financial_report_test'
|
||||
NOTIFY_ADDR = 'http://10.127.2.206:8101/api/tenant/report/notify'
|
||||
FILE_PATH = '/root/pdf_parser/pdf/'
|
||||
REDIS_HOST = '10.127.2.206'
|
||||
REDIS_PORT = 6379
|
||||
REDIS_PASSWORD = 'Xgf_redis'
|
||||
FILE_PATH = '/root/pdf_parser/pdf/'
|
||||
PORT = 8000
|
||||
MEASURE_COUNT = 8
|
||||
MEASURE_COUNT = 4
|
||||
|
||||
|
||||
MYSQL_HOST_APP = '121.37.185.246'
|
||||
MYSQL_HOST_APP = '10.127.2.207'
|
||||
MYSQL_PORT_APP = 3306
|
||||
MYSQL_USER_APP = 'financial'
|
||||
MYSQL_PASSWORD_APP = 'financial_8000'
|
||||
MYSQL_DB_APP = 'financial_report'
|
||||
MYSQL_USER_APP = 'financial_prod'
|
||||
MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
|
||||
MYSQL_DB_APP = 'financial_report_test'
|
||||
|
||||
|
||||
api_key = 'sk-f8413fcfa63a40e49dbb8ac1d0239f6d'
|
||||
# 'sk-3cc9e1601f654c149d2a4e99ef8a8946'
|
||||
|
||||
|
||||
#MYSQL_HOST_APP = '192.168.0.201'
|
||||
#MYSQL_PORT_APP = 3306
|
||||
#MYSQL_USER_APP = 'root'
|
||||
#MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
|
||||
#MYSQL_DB_APP = 'financial_report_prod'
|
||||
|
|
|
@ -10,6 +10,9 @@ from pymilvus import MilvusClient
|
|||
import mysql.connector
|
||||
import threading
|
||||
import redis
|
||||
from log_config import logger
|
||||
|
||||
|
||||
|
||||
measure_name_keywords = ["营业","季度","利润","归属于","扣非","经营","现金","活动","损益","收益","资产","费用","销售","管理","财务","研发","货币资金","应收账款","存货","固定资产","在建工程","商誉","短期借款","应付账款","合同负债","长期借款","营业成本"]
|
||||
# 解析大模型抽取的指标,并插入到数据库
|
||||
|
@ -133,9 +136,9 @@ def insert_table_unit_info_v1(table_info, conn, cursor):
|
|||
WHERE file_id = %s AND page_num = %s AND table_index = %s
|
||||
'''
|
||||
cursor.execute(update_query, (unit, file_id, page_num, table_index))
|
||||
#print(f'Updated existing record with file_id={file_id}, page_num={page_num}, table_index={table_index}.')
|
||||
logger.info(f'Updated existing record with file_id={file_id}, page_num={page_num}, table_index={table_index}.')
|
||||
else:
|
||||
print(f'No change needed. Existing unit={existing_unit} is the same as new unit={unit}.')
|
||||
logger.info(f'No change needed. Existing unit={existing_unit} is the same as new unit={unit}.')
|
||||
else:
|
||||
# 插入新的记录
|
||||
insert_query = '''
|
||||
|
@ -145,7 +148,7 @@ def insert_table_unit_info_v1(table_info, conn, cursor):
|
|||
'''
|
||||
data_to_insert = (file_id, page_num, table_index, unit)
|
||||
cursor.execute(insert_query, data_to_insert)
|
||||
#print(f'Inserted new record with file_id={file_id}, page_num={page_num}, table_index={table_index}, unit={unit}.')
|
||||
logger.info(f'Inserted new record with file_id={file_id}, page_num={page_num}, table_index={table_index}, unit={unit}.')
|
||||
|
||||
conn.commit()
|
||||
|
||||
|
@ -190,6 +193,16 @@ def update_ori_measure(conn,cursor,file_id):
|
|||
and t1.file_id = '{file_id}'
|
||||
and t2.year = '{year}'
|
||||
'''.format(file_id=file_id, year=report_year)
|
||||
select_query_first = '''
|
||||
SELECT t2.measure_id,t2.measure_name,t1.ori_measure_id
|
||||
FROM ori_measure_list t1
|
||||
left join
|
||||
measure_config_first_quarter t2
|
||||
on t1.ori_measure_id = t2.ori_measure_id
|
||||
where t2.measure_id is not null and (t1.measure_id is null or t1.measure_id ='')
|
||||
and t1.file_id = '{file_id}'
|
||||
and t2.year = '{year}'
|
||||
'''.format(file_id=file_id, year=report_year)
|
||||
select_query_half_year = '''
|
||||
SELECT t2.measure_id,t2.measure_name,t1.ori_measure_id
|
||||
FROM ori_measure_list t1
|
||||
|
@ -205,59 +218,73 @@ def update_ori_measure(conn,cursor,file_id):
|
|||
FROM ori_measure_list t1
|
||||
left join
|
||||
measure_config_third_quarter t2
|
||||
on t1.ori_measure_id = t2.ori_measure_id
|
||||
on t1.ori_measure_id = t2.ori_measure_id
|
||||
where t2.measure_id is not null and (t1.measure_id is null or t1.measure_id ='')
|
||||
and t1.file_id = '{file_id}'
|
||||
and t2.year = '{year}'
|
||||
'''.format(file_id=file_id, year=report_year)
|
||||
|
||||
if report_type == 1:
|
||||
if report_type == 1:#半年报
|
||||
start_time = time.time()
|
||||
cursor.execute(select_query_half_year)
|
||||
records = cursor.fetchall()
|
||||
end_time = time.time()
|
||||
print(f"更新数据查询 {(end_time - start_time):.2f} 秒。")
|
||||
print(f'update_ori_measure方法走的是半年报')
|
||||
elif report_type == 3:
|
||||
logger.info(f"更新数据查询 {(end_time - start_time):.2f} 秒。")
|
||||
logger.info(f'update_ori_measure方法走的是半年报')
|
||||
elif report_type == 2: # 一季报
|
||||
start_time = time.time()
|
||||
cursor.execute(select_query_first)
|
||||
records = cursor.fetchall()
|
||||
end_time = time.time()
|
||||
logger.info(f"更新数据查询 {(end_time - start_time):.2f} 秒。")
|
||||
logger.info(f'update_ori_measure方法走的是一季报')
|
||||
elif report_type == 3: # 三季报
|
||||
start_time = time.time()
|
||||
cursor.execute(select_query_thrid)
|
||||
records = cursor.fetchall()
|
||||
end_time = time.time()
|
||||
print(f"更新数据查询 {(end_time - start_time):.2f} 秒。")
|
||||
print(f'update_ori_measure方法走的是三季报')
|
||||
else:
|
||||
logger.info(f"更新数据查询 {(end_time - start_time):.2f} 秒。")
|
||||
logger.info(f'update_ori_measure方法走的是三季报')
|
||||
else:# 年报
|
||||
start_time = time.time()
|
||||
cursor.execute(select_query)
|
||||
records = cursor.fetchall()
|
||||
end_time = time.time()
|
||||
print(f"更新数据查询 {(end_time - start_time):.2f} 秒。")
|
||||
print(f'update_ori_measure方法走的是全年报')
|
||||
logger.info(f"更新数据查询 {(end_time - start_time):.2f} 秒。")
|
||||
logger.info(f'update_ori_measure方法走的是全年报')
|
||||
start_time = time.time()
|
||||
for record in records:
|
||||
data_to_update = (record[0], record[1], record[2], file_id)
|
||||
cursor.execute(update_query, data_to_update)
|
||||
conn.commit()
|
||||
end_time = time.time()
|
||||
print(f"更新数据更新 {(end_time - start_time):.2f} 秒。")
|
||||
logger.info(f"更新数据更新 {(end_time - start_time):.2f} 秒。")
|
||||
#更新measure_list表,增加此次文件的显示指标
|
||||
start_time = time.time()
|
||||
create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
if report_type == 0:
|
||||
if report_type == 0:#全年报
|
||||
insert_query = '''
|
||||
INSERT INTO measure_list
|
||||
(measure_id, measure_name, create_time, update_time, file_id)
|
||||
select distinct measure_id,measure_name, %s,%s,%s from measure_config
|
||||
where year = '{year}'
|
||||
'''.format(year=report_year)
|
||||
elif report_type == 3:
|
||||
elif report_type == 2:# 一季报
|
||||
insert_query = '''
|
||||
INSERT INTO measure_list
|
||||
(measure_id, measure_name, create_time, update_time, file_id)
|
||||
select distinct measure_id,measure_name, %s,%s,%s from measure_config_first_quarter
|
||||
where year = '{year}'
|
||||
'''.format(year=report_year)
|
||||
elif report_type == 3:# 三季报
|
||||
insert_query = '''
|
||||
INSERT INTO measure_list
|
||||
(measure_id, measure_name, create_time, update_time, file_id)
|
||||
select distinct measure_id,measure_name, %s,%s,%s from measure_config_third_quarter
|
||||
where year = '{year}'
|
||||
'''.format(year=report_year)
|
||||
else:
|
||||
else:# 半年报
|
||||
insert_query = '''
|
||||
INSERT INTO measure_list
|
||||
(measure_id, measure_name, create_time, update_time, file_id)
|
||||
|
@ -269,13 +296,13 @@ def update_ori_measure(conn,cursor,file_id):
|
|||
cursor.execute(insert_query, data_to_update)
|
||||
conn.commit()
|
||||
end_time = time.time()
|
||||
print(f"更新数据写入 {(end_time - start_time):.2f} 秒。")
|
||||
logger.info(f"更新数据写入 {(end_time - start_time):.2f} 秒。")
|
||||
|
||||
def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,records,record_range,black_array,partition_name,):
|
||||
create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
print('Run task %s (%s)...' % (record_range, os.getpid()))
|
||||
print(f"插入数据 {len(records)}")
|
||||
logger.info(f'Run task {record_range} ({os.getpid()})...')
|
||||
logger.info(f"插入数据 {len(records)}")
|
||||
|
||||
|
||||
conn = mysql.connector.connect(
|
||||
|
@ -332,11 +359,12 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
|
||||
cursor_app.execute(select_parent_query)
|
||||
parent_records = cursor_app.fetchall()
|
||||
#print(f"before: {parent_table_pages}")
|
||||
|
||||
|
||||
for parent_record in parent_records:
|
||||
parent_id = parent_record[0]
|
||||
parent_table_pages.append(int(parent_id))
|
||||
#print(f"after: {parent_table_pages}")
|
||||
|
||||
|
||||
#表格上方文字黑名单关键词的页码和表格下标转成数组
|
||||
table_index_array = []
|
||||
|
@ -348,15 +376,19 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
measure_index_array = []
|
||||
cursor_app.execute(select_measure_index_query, (file_id,))
|
||||
measure_index_records = cursor_app.fetchall()
|
||||
print("Executing SQL:", select_measure_index_query)
|
||||
print("With file_id:", file_id)
|
||||
logger.info(f"Executing SQL:{select_measure_index_query}")
|
||||
logger.info(f"With file_id:{file_id}")
|
||||
for measure_index_record in measure_index_records:
|
||||
measure_index_array.append(measure_index_record[0])
|
||||
print(f'黑名单的值是{parent_table_pages}和{table_index_array}以及新增的{measure_index_array}')
|
||||
logger.info(f'黑名单的值是{parent_table_pages}和{table_index_array}以及新增的{measure_index_array}')
|
||||
#print(f'黑名单的值是{parent_table_pages}和{table_index_array}')
|
||||
record_start = record_range.split('-')[0]
|
||||
record_end = record_range.split('-')[1]
|
||||
|
||||
if str(report_type) == "2":
|
||||
table_index_array = []
|
||||
measure_index_array = []
|
||||
|
||||
client = MilvusClient(
|
||||
uri=MILVUS_CLIENT,
|
||||
)
|
||||
|
@ -370,6 +402,8 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
ori_measure_id = record[3]
|
||||
measure_id = record[4]
|
||||
measure_vector = redis_service.read_from_redis(redis_client,ori_measure_id)
|
||||
|
||||
|
||||
measure_list = ast.literal_eval(measure_vector)
|
||||
data = [measure_list]
|
||||
filter_str = 'file_id == "'+file_id+'"'
|
||||
|
@ -383,10 +417,10 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
filter=filter_str,
|
||||
partition_name=partition_name
|
||||
)
|
||||
|
||||
|
||||
|
||||
# Convert the output to a formatted JSON string
|
||||
# for i in range(len(res[0])):
|
||||
|
||||
for i in range(len(res[0])):
|
||||
|
||||
vector_distance = float(res[0][i]["distance"])
|
||||
|
@ -411,17 +445,18 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
if utils.check_pdf_measure_black_list(pdf_measure):
|
||||
continue
|
||||
if f"{table_num}_{table_index}" in measure_index_array and utils.check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,conn_app,cursor_app):
|
||||
#if utils.check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,conn_app,cursor_app):
|
||||
print(f'经过第三层规则去除了{table_num}页的{pdf_measure}指标')
|
||||
logger.info(f'经过第三层规则去除了{table_num}页的{pdf_measure}指标')
|
||||
continue
|
||||
|
||||
|
||||
|
||||
|
||||
if vector_distance > distance and table_num not in parent_table_pages:
|
||||
#检测规则开始
|
||||
#判断抽取指标和财报指标周期是否相同
|
||||
ori_period = utils.get_period_type(ori_measure_name, report_year)
|
||||
pdf_period = utils.get_period_type(pdf_measure, report_year)
|
||||
if pdf_measure == '2023年6月30日货币资金合计':
|
||||
print(f'第1处{ori_period}和{pdf_period}')
|
||||
logger.info(f'第1处{ori_period}和{pdf_period}')
|
||||
if(ori_period != pdf_period):
|
||||
continue
|
||||
|
||||
|
@ -429,7 +464,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
start_ori_period = utils.get_start_period_type(ori_measure_name)
|
||||
start_pdf_period = utils.get_start_period_type(pdf_measure)
|
||||
if pdf_measure == '2023年6月30日货币资金合计':
|
||||
print(f'第2处{start_ori_period}和{start_pdf_period}')
|
||||
logger.info(f'第2处{start_ori_period}和{start_pdf_period}')
|
||||
if(start_ori_period != start_pdf_period):
|
||||
continue
|
||||
|
||||
|
@ -437,7 +472,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
ori_season_type = utils.get_season_flag(ori_measure_name)
|
||||
pdf_season_type = utils.get_season_flag(pdf_measure)
|
||||
if pdf_measure == '2023年6月30日货币资金合计':
|
||||
print(f'第3处{ori_season_type}和{pdf_season_type}')
|
||||
logger.info(f'第3处{ori_season_type}和{pdf_season_type}')
|
||||
if(ori_season_type != pdf_season_type):
|
||||
continue
|
||||
|
||||
|
@ -445,7 +480,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
ori_kf_type = utils.get_kf_flag(ori_measure_name)
|
||||
pdf_kf_type = utils.get_kf_flag(pdf_measure)
|
||||
if pdf_measure == '2023年6月30日货币资金合计':
|
||||
print(f'第4处{ori_kf_type}和{pdf_kf_type}')
|
||||
logger.info(f'第4处{ori_kf_type}和{pdf_kf_type}')
|
||||
if(ori_kf_type != pdf_kf_type):
|
||||
continue
|
||||
|
||||
|
@ -453,7 +488,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
ori_type = utils.get_percent_flag(ori_measure_name)
|
||||
pdf_type = utils.get_percent_flag(pdf_measure)
|
||||
if pdf_measure == '2023年6月30日货币资金合计':
|
||||
print(f'第5处{ori_type}和{pdf_type}')
|
||||
logger.info(f'第5处{ori_type}和{pdf_type}')
|
||||
if(ori_type != pdf_type):
|
||||
continue
|
||||
|
||||
|
@ -461,7 +496,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
ori_growth_type = utils.get_percent_growth(ori_measure_name)
|
||||
pdf_growth_type = utils.get_percent_growth(pdf_measure)
|
||||
if pdf_measure == '2023年6月30日货币资金合计':
|
||||
print(f'第6处{ori_growth_type}和{pdf_growth_type}')
|
||||
logger.info(f'第6处{ori_growth_type}和{pdf_growth_type}')
|
||||
if(ori_growth_type != pdf_growth_type):
|
||||
continue
|
||||
|
||||
|
@ -526,12 +561,12 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
unit = unit_records[0][0]
|
||||
else:
|
||||
unit = '元'
|
||||
|
||||
|
||||
data_to_insert = (file_id, file_name, "table", int(table_num), int(table_index), ori_measure_id, ori_measure_name, measure_value, create_time, create_time, vector_distance, pdf_measure,measure_id,measure_name,unit)
|
||||
cursor.execute(insert_query, data_to_insert)
|
||||
conn.commit()
|
||||
except Exception as e:
|
||||
print(e)
|
||||
logger.info(e)
|
||||
finally:
|
||||
parent_table_pages = []
|
||||
client.close()
|
||||
|
@ -550,6 +585,10 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
|
|||
SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config
|
||||
where year = '{year}'
|
||||
'''.format(year=report_year)
|
||||
select_query_first_quarter = '''
|
||||
SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config_first_quarter
|
||||
where year = '{year}'
|
||||
'''.format(year=report_year)
|
||||
select_query_half_year = '''
|
||||
SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config_half_year
|
||||
where year = '{year}'
|
||||
|
@ -574,8 +613,8 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
|
|||
cursor.execute(select_query_half_year)
|
||||
records = cursor.fetchall()
|
||||
end_time = time.time()
|
||||
print(f"向量配置数据查询 {(end_time - start_time):.2f} 秒。")
|
||||
print('insert_table_measure_from_vector_async_process方法走的半年报')
|
||||
logger.info(f"向量配置数据查询 {(end_time - start_time):.2f} 秒。")
|
||||
logger.info(f'insert_table_measure_from_vector_async_process方法走的半年报')
|
||||
start_time = time.time()
|
||||
records_range_parts = utils.get_range(len(records),MEASURE_COUNT)
|
||||
processes = []
|
||||
|
@ -583,13 +622,27 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
|
|||
p = Process(target=insert_table_from_vector_mul_process, args=(parent_table_pages,file_id,file_name,records,record_range,black_array, partition_name))
|
||||
processes.append(p)
|
||||
p.start()
|
||||
elif report_type == 2:
|
||||
start_time = time.time()
|
||||
cursor.execute(select_query_first_quarter)
|
||||
records = cursor.fetchall()
|
||||
end_time = time.time()
|
||||
logger.info(f"向量配置数据查询 {(end_time - start_time):.2f} 秒。")
|
||||
logger.info(f'insert_table_measure_from_vector_async_process方法走的一季报')
|
||||
start_time = time.time()
|
||||
records_range_parts = utils.get_range(len(records),MEASURE_COUNT)
|
||||
processes = []
|
||||
for record_range in records_range_parts:
|
||||
p = Process(target=insert_table_from_vector_mul_process, args=(parent_table_pages,file_id,file_name,records,record_range,black_array,partition_name))
|
||||
processes.append(p)
|
||||
p.start()
|
||||
elif report_type == 3:
|
||||
start_time = time.time()
|
||||
cursor.execute(select_query_thrid)
|
||||
records = cursor.fetchall()
|
||||
end_time = time.time()
|
||||
print(f"向量配置数据查询 {(end_time - start_time):.2f} 秒。")
|
||||
print('insert_table_measure_from_vector_async_process方法走的三季报')
|
||||
logger.info(f"向量配置数据查询 {(end_time - start_time):.2f} 秒。")
|
||||
logger.info(f'insert_table_measure_from_vector_async_process方法走的三季报')
|
||||
start_time = time.time()
|
||||
records_range_parts = utils.get_range(len(records),MEASURE_COUNT)
|
||||
processes = []
|
||||
|
@ -603,8 +656,8 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
|
|||
cursor.execute(select_query)
|
||||
records = cursor.fetchall()
|
||||
end_time = time.time()
|
||||
print(f"向量配置数据查询 {(end_time - start_time):.2f} 秒。")
|
||||
print('insert_table_measure_from_vector_async_process方法走的全年报')
|
||||
logger.info(f"向量配置数据查询 {(end_time - start_time):.2f} 秒。")
|
||||
logger.info(f'insert_table_measure_from_vector_async_process方法走的全年报')
|
||||
start_time = time.time()
|
||||
records_range_parts = utils.get_range(len(records),MEASURE_COUNT)
|
||||
processes = []
|
||||
|
@ -613,13 +666,13 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
|
|||
processes.append(p)
|
||||
p.start()
|
||||
|
||||
print('等待所有子任务完成,任务ID:', file_id)
|
||||
logger.info(f'等待所有子任务完成,任务ID:{file_id}' )
|
||||
for p in processes:
|
||||
p.join()
|
||||
print('所有子任务完成,任务ID:', file_id)
|
||||
print('启动指标归一化任务ID:', file_id)
|
||||
logger.info(f'所有子任务完成,任务ID:{file_id}')
|
||||
logger.info(f'启动指标归一化任务ID:{file_id}')
|
||||
end_time = time.time()
|
||||
print(f"向量更新时间 {(end_time - start_time):.2f} 秒。")
|
||||
logger.info(f"向量更新时间 {(end_time - start_time):.2f} 秒。")
|
||||
|
||||
def insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_id,file_name):
|
||||
create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
@ -646,7 +699,7 @@ def insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_
|
|||
cursor.execute(select_query)
|
||||
records = cursor.fetchall()
|
||||
end_time = time.time()
|
||||
print(f"向量配置数据查询 {(end_time - start_time):.2f} 秒。")
|
||||
logger.info(f"向量配置数据查询 {(end_time - start_time):.2f} 秒。")
|
||||
start_time = time.time()
|
||||
|
||||
|
||||
|
@ -708,9 +761,9 @@ def insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_
|
|||
cursor.execute(insert_query, data_to_insert)
|
||||
conn.commit()
|
||||
except Exception as e:
|
||||
print(e)
|
||||
logger.info(e)
|
||||
end_time = time.time()
|
||||
print(f"向量更新数据时间 {(end_time - start_time):.2f} 秒。")
|
||||
logger.info(f"向量更新数据时间 {(end_time - start_time):.2f} 秒。")
|
||||
start_time = time.time()
|
||||
|
||||
|
||||
|
@ -720,6 +773,7 @@ def insert_measure_data_to_milvus(client,partition_name,table_info,cursor,conn):
|
|||
(file_id, page_num, content)
|
||||
VALUES (%s, %s, %s)
|
||||
'''
|
||||
|
||||
for table in table_info:
|
||||
try:
|
||||
data=[]
|
||||
|
@ -730,6 +784,12 @@ def insert_measure_data_to_milvus(client,partition_name,table_info,cursor,conn):
|
|||
measure_list = table['measure_list']
|
||||
for measure in measure_list:
|
||||
measure_name = measure['measure_name']
|
||||
|
||||
# 需要跳过的一些指标
|
||||
black_list = ["营业总成本"]
|
||||
if any(black in measure_name for black in black_list):
|
||||
continue
|
||||
|
||||
measure_value = measure['measure_value'].replace("(", "").replace(")", "")
|
||||
measure_name = utils.get_clean_text(measure_name)
|
||||
measure_name = measure_name.replace('2023','2023年').replace('2022','2022年').replace('(','').replace(')','')#这个真绝了,怎么都删不掉
|
||||
|
@ -745,7 +805,9 @@ def insert_measure_data_to_milvus(client,partition_name,table_info,cursor,conn):
|
|||
measure_name_1 = measure_name.replace('调整后','').replace('上年期末数','上年期末').replace('上年期末','上年年末')
|
||||
measure_unit = measure['measure_unit']
|
||||
if re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', measure_value) and any(key_word in measure_name for key_word in measure_name_keywords):
|
||||
|
||||
vector_obj = utils.embed_with_str(measure_name_1)
|
||||
|
||||
vector = vector_obj.output["embeddings"][0]["embedding"]
|
||||
measure_data = {}
|
||||
measure_data['vector'] = vector
|
||||
|
@ -773,7 +835,7 @@ def insert_measure_data_to_milvus(client,partition_name,table_info,cursor,conn):
|
|||
measure_value = match.group(2)
|
||||
if crease_type == '减少' or crease_type == '下降':
|
||||
measure_value = f'-{match.group(2)}'
|
||||
|
||||
|
||||
vector_obj = utils.embed_with_str(measure_name_1)
|
||||
vector = vector_obj.output["embeddings"][0]["embedding"]
|
||||
measure_data = {}
|
||||
|
@ -800,18 +862,18 @@ def insert_measure_data_to_milvus(client,partition_name,table_info,cursor,conn):
|
|||
data=data,
|
||||
partition_name=partition_name
|
||||
)
|
||||
logger.info(f"向量插入结束")
|
||||
|
||||
except Exception as e:
|
||||
print(e)
|
||||
logger.info(e)
|
||||
|
||||
def runing_job():
|
||||
conn = mysql.connector.connect(
|
||||
host= MYSQL_HOST,
|
||||
user= MYSQL_USER,
|
||||
password= MYSQL_PASSWORD,
|
||||
database= MYSQL_DB
|
||||
host = MYSQL_HOST,
|
||||
user = MYSQL_USER,
|
||||
password = MYSQL_PASSWORD,
|
||||
database = MYSQL_DB
|
||||
)
|
||||
|
||||
# 创建一个cursor对象来执行SQL语句
|
||||
cursor = conn.cursor(buffered=True)
|
||||
select_query = '''
|
||||
|
@ -824,7 +886,6 @@ def runing_job():
|
|||
return False
|
||||
|
||||
def insert_pdf_parse_process(parser_info,conn,cursor):
|
||||
|
||||
# 执行SQL语句,插入数据
|
||||
insert_query = '''
|
||||
INSERT INTO pdf_parse_process
|
||||
|
@ -839,58 +900,78 @@ def insert_pdf_parse_process(parser_info,conn,cursor):
|
|||
data_to_insert = (file_id, page_num, page_count, content, type)
|
||||
cursor.execute(insert_query, data_to_insert)
|
||||
conn.commit()
|
||||
|
||||
|
||||
def delete_MYSQL_DB_APP(file_id):
|
||||
conn = mysql.connector.connect(
|
||||
host = MYSQL_HOST_APP,
|
||||
user = MYSQL_USER_APP,
|
||||
password = MYSQL_PASSWORD_APP,
|
||||
database = MYSQL_DB_APP
|
||||
)
|
||||
|
||||
def delete_database(conn,cursor,file_id):
|
||||
try:
|
||||
truncate_query = [
|
||||
"delete from measure_parse_process where file_id = %s;",
|
||||
"delete from measure_parser_info where file_id = %s;",
|
||||
"delete from pdf_parse_process where file_id = %s;",
|
||||
"delete from table_unit_info where file_id = %s;",
|
||||
# "delete from a where file_id = %s;",
|
||||
# "delete from b where file_id = %s;",
|
||||
]
|
||||
#file_id = file_id
|
||||
for truncate in truncate_query:
|
||||
cursor.execute(truncate,(file_id,))
|
||||
conn.commit()
|
||||
except Exception as e:
|
||||
print(f'删除失败,原因是{e}')
|
||||
def delete_to_run(conn,cursor,file_id):
|
||||
# 创建一个cursor对象来执行SQL语句
|
||||
cursor = conn.cursor(buffered=True)
|
||||
try:
|
||||
truncate_query = [
|
||||
"delete from ori_measure_list where file_id = %s;",
|
||||
"delete from measure_list where file_id = %s;",
|
||||
"delete from check_measure_list where file_id = %s;",
|
||||
"delete from check_measure_detail_list where file_id = %s;",
|
||||
# "delete from table_unit_info where file_id = %s;",
|
||||
# "delete from pdf_parse_process where file_id = %s;",
|
||||
# "delete from table_unit_info where file_id = %s;",
|
||||
# "delete from a where file_id = %s;",
|
||||
# "delete from b where file_id = %s;",
|
||||
"delete from measure_parse_process where file_id = %s;",
|
||||
"delete from measure_parser_info where file_id = %s;",
|
||||
"delete from pdf_parse_process where file_id = %s;",
|
||||
"delete from table_unit_info where file_id = %s;",
|
||||
]
|
||||
#file_id = file_id
|
||||
for truncate in truncate_query:
|
||||
cursor.execute(truncate,(file_id,))
|
||||
conn.commit()
|
||||
except Exception as e:
|
||||
print(f'删除失败,原因是{e}')
|
||||
logger.info(f'删除失败,原因是{e}')
|
||||
|
||||
def delete_MYSQL_DB(file_id):
|
||||
conn = mysql.connector.connect(
|
||||
host = MYSQL_HOST,
|
||||
user = MYSQL_USER,
|
||||
password = MYSQL_PASSWORD,
|
||||
database = MYSQL_DB
|
||||
)
|
||||
|
||||
# 创建一个cursor对象来执行SQL语句
|
||||
cursor = conn.cursor(buffered=True)
|
||||
try:
|
||||
truncate_query = [
|
||||
|
||||
"delete from measure_list where file_id = %s;",
|
||||
"delete from check_measure_list where file_id = %s;",
|
||||
"delete from check_measure_detail_list where file_id = %s;",
|
||||
"delete from measure_parse_process where file_id = %s;",
|
||||
"delete from measure_parser_info where file_id = %s;",
|
||||
"delete from pdf_parse_process where file_id = %s;",
|
||||
"delete from table_unit_info where file_id = %s;",
|
||||
]
|
||||
#file_id = file_id
|
||||
for truncate in truncate_query:
|
||||
cursor.execute(truncate,(file_id,))
|
||||
conn.commit()
|
||||
except Exception as e:
|
||||
logger.info(f'删除失败,原因是{e}')
|
||||
|
||||
def insert_pdf_text_info(table_info,conn,cursor):
|
||||
|
||||
# 执行SQL语句,插入数据
|
||||
|
||||
insert_query = '''
|
||||
INSERT INTO pdf_text_info
|
||||
(file_id, page_num, text)
|
||||
VALUES (%s, %s, %s)
|
||||
'''
|
||||
file_id = table_info['file_id']
|
||||
page_num = int(table_info['page_num'])
|
||||
page_num = table_info['page_num']
|
||||
text = table_info['text']
|
||||
data_to_insert = (file_id, page_num, text)
|
||||
cursor.execute(insert_query, data_to_insert)
|
||||
conn.commit()
|
||||
|
||||
|
||||
def process_time(file_id,type,time,start_time,end_time):
|
||||
conn = mysql.connector.connect(
|
||||
|
@ -911,6 +992,7 @@ def process_time(file_id,type,time,start_time,end_time):
|
|||
data_insert = (file_id,type,time,start_time,end_time)
|
||||
cursor.execute(insert_query,data_insert)
|
||||
conn.commit()
|
||||
|
||||
def batch_insert_page_text_nocheck(table_info, conn, cursor):
|
||||
file_id = table_info['file_id']
|
||||
page_num = int(table_info['page_num'])
|
||||
|
@ -923,6 +1005,7 @@ def batch_insert_page_text_nocheck(table_info, conn, cursor):
|
|||
data_to_insert = [(file_id, page_num, text) for text in text_lines]
|
||||
cursor.executemany(insert_query, data_to_insert)
|
||||
conn.commit()
|
||||
|
||||
def batch_insert_page_text(table_info, conn, cursor):
|
||||
file_id = table_info['file_id']
|
||||
page_num = int(table_info['page_num'])
|
||||
|
@ -945,6 +1028,7 @@ def batch_insert_page_text(table_info, conn, cursor):
|
|||
else:
|
||||
pass
|
||||
conn.commit()
|
||||
|
||||
def file_type_check(file_id):
|
||||
conn = mysql.connector.connect(
|
||||
host= MYSQL_HOST,
|
||||
|
@ -965,6 +1049,7 @@ def file_type_check(file_id):
|
|||
finally:
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
def file_type_check_v2(file_id):
|
||||
conn = mysql.connector.connect(
|
||||
host= MYSQL_HOST,
|
||||
|
@ -989,10 +1074,10 @@ def file_type_check_v2(file_id):
|
|||
|
||||
def pdf_title_insert_mysql(file_id,title_array):
|
||||
conn = mysql.connector.connect(
|
||||
host= MYSQL_HOST,
|
||||
user= MYSQL_USER,
|
||||
password= MYSQL_PASSWORD,
|
||||
database= MYSQL_DB
|
||||
host = MYSQL_HOST,
|
||||
user = MYSQL_USER,
|
||||
password = MYSQL_PASSWORD,
|
||||
database = MYSQL_DB
|
||||
)
|
||||
cursor = conn.cursor(buffered=True)
|
||||
for item in title_array:
|
||||
|
@ -1003,13 +1088,12 @@ def pdf_title_insert_mysql(file_id,title_array):
|
|||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
|
||||
def get_file_info_from_mysql(file_id):
|
||||
conn = mysql.connector.connect(
|
||||
host= MYSQL_HOST,
|
||||
user= MYSQL_USER,
|
||||
password= MYSQL_PASSWORD,
|
||||
database= MYSQL_DB
|
||||
host = MYSQL_HOST,
|
||||
user = MYSQL_USER,
|
||||
password = MYSQL_PASSWORD,
|
||||
database = MYSQL_DB
|
||||
)
|
||||
#cursor = conn.cursor(buffered=True)
|
||||
cursor = conn.cursor(dictionary=True)
|
||||
|
|
|
@ -0,0 +1,84 @@
|
|||
#报错提示
|
||||
import paramiko
|
||||
import time
|
||||
import threading
|
||||
|
||||
# 执行命令的函数
|
||||
def execute_commands_on_server(hostname, username, password, host):
|
||||
try:
|
||||
# 连接到服务器
|
||||
client = paramiko.SSHClient()
|
||||
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
client.connect(hostname=hostname, username=username, password=password)
|
||||
|
||||
# 执行命令
|
||||
shell = client.invoke_shell()
|
||||
#启动docker
|
||||
shell.send("cd /root/pdf_parser/pdf\n")
|
||||
time.sleep(1)
|
||||
shell.send("rm -f *.pdf\n")
|
||||
time.sleep(10)
|
||||
shell.send("rm -f *.PDF\n")
|
||||
time.sleep(10)
|
||||
# 读取输出
|
||||
output = shell.recv(2048).decode()
|
||||
print(f"Output from {hostname}:\n{output}")
|
||||
|
||||
except paramiko.SSHException as e:
|
||||
print(f"SSH connection error with {hostname}: {e}")
|
||||
|
||||
finally:
|
||||
client.close()
|
||||
|
||||
# 创建线程函数
|
||||
def thread_function(server):
|
||||
execute_commands_on_server(server['hostname'], server['username'], server['password'], server['host'])
|
||||
|
||||
# 服务器列表
|
||||
# servers = [
|
||||
# {'hostname': 'server1.example.com', 'username': 'user1', 'password': 'pass1', 'host': 'host1'},
|
||||
# {'hostname': 'server2.example.com', 'username': 'user2', 'password': 'pass2', 'host': 'host2'},
|
||||
# # 添加更多服务器
|
||||
# ]
|
||||
servers = [
|
||||
#{'hostname': '124.70.129.232', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'测试服务器'},
|
||||
# {'hostname': '1.94.179.121', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器'},#废弃
|
||||
|
||||
#旧10台
|
||||
{'hostname': '113.44.72.157', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器1'},
|
||||
{'hostname': '1.94.101.237', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器2'},
|
||||
{'hostname': '123.60.16.225', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器3'},
|
||||
{'hostname': '124.71.157.162', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器4'},
|
||||
|
||||
{'hostname': '1.94.60.103', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器5'},
|
||||
{'hostname': '1.94.143.23', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器6'},#都往这里存
|
||||
{'hostname': '124.71.149.225', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器7'},
|
||||
{'hostname': '113.44.52.221', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器8'},
|
||||
{'hostname': '121.37.137.13', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器9'},
|
||||
{'hostname': '123.60.28.83', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'生产服务器10'},
|
||||
#新10台
|
||||
{'hostname': '192.168.0.19', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新生产服务器1'},
|
||||
{'hostname': '192.168.0.53', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新生产服务器2'},
|
||||
{'hostname': '192.168.0.150', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新生产服务器3'},
|
||||
{'hostname': '192.168.0.210', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新生产服务器4'},
|
||||
|
||||
{'hostname': '192.168.0.129', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新生产服务器5'},
|
||||
{'hostname': '192.168.0.24', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新生产服务器6'},
|
||||
{'hostname': '192.168.0.250', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新生产服务器7'},
|
||||
{'hostname': '192.168.0.162', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新生产服务器8'},
|
||||
{'hostname': '192.168.0.86', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新生产服务器9'},
|
||||
{'hostname': '192.168.0.88', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'新生产服务器10'},
|
||||
]
|
||||
|
||||
# 创建并启动线程
|
||||
threads = []
|
||||
for server in servers:
|
||||
thread = threading.Thread(target=thread_function, args=(server,))
|
||||
threads.append(thread)
|
||||
thread.start()
|
||||
|
||||
# 等待所有线程完成
|
||||
for thread in threads:
|
||||
thread.join()
|
||||
|
||||
print("All commands executed.")
|
|
@ -0,0 +1,246 @@
|
|||
import pandas as pd
|
||||
import mysql.connector
|
||||
import utils
|
||||
#from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
|
||||
import re
|
||||
import redis
|
||||
|
||||
def process_excel_and_db(input_excel_path1, input_excel_path2, output_file_path):
|
||||
# 读取第一个 Excel 文件
|
||||
df = pd.read_excel(input_excel_path1, sheet_name='Sheet2', header=0)#对应ttt表
|
||||
# 将 DataFrame 转换为字典列表
|
||||
data_list = df.to_dict(orient='records')
|
||||
|
||||
# 连接到 MySQL 数据库
|
||||
conn = mysql.connector.connect(
|
||||
host=MYSQL_HOST,
|
||||
user=MYSQL_USER,
|
||||
password=MYSQL_PASSWORD,
|
||||
database=MYSQL_DB
|
||||
)
|
||||
cursor = conn.cursor()
|
||||
|
||||
# 插入数据到 measure_create_config 表
|
||||
insert_query = '''
|
||||
INSERT INTO measure_create_config
|
||||
(config_id, meta_measure, same_mean_measure, measure_period, change_type, black_list)
|
||||
VALUES (%s, %s, %s, %s, %s, %s)
|
||||
'''
|
||||
for data in data_list:
|
||||
show_measure = str(data['指标'])
|
||||
same_mean_measure = str(data['同义表述'])
|
||||
period_measure = str(data['周期'])
|
||||
change_measure = str(data['变动'])
|
||||
black_list = str(data['黑名单词'])
|
||||
config_id = utils.get_md5(show_measure)
|
||||
insert_query_data = (config_id, show_measure, same_mean_measure, period_measure, change_measure, black_list)
|
||||
cursor.execute(insert_query, insert_query_data)
|
||||
conn.commit()
|
||||
|
||||
# 读取第二个 Excel 文件
|
||||
df_period = pd.read_excel(input_excel_path2, sheet_name='Sheet2', header=0)#对应周期表
|
||||
# 将 DataFrame 转换为字典列表
|
||||
period_list = df_period.to_dict(orient='records')
|
||||
|
||||
# 插入数据到 measure_create_period 表
|
||||
period_insert_query = '''
|
||||
INSERT INTO measure_create_period
|
||||
(period_name, same_mean_period)
|
||||
VALUES (%s, %s)
|
||||
'''
|
||||
for data in period_list:
|
||||
period_name = str(data['标准表述'])
|
||||
same_mean_period = str(data['同义表述'])
|
||||
insert_query_data = (period_name, same_mean_period)
|
||||
cursor.execute(period_insert_query, insert_query_data)
|
||||
conn.commit()
|
||||
|
||||
# 查询数据库
|
||||
data_query = '''
|
||||
SELECT * FROM measure_create_config WHERE delete_status = 0
|
||||
'''
|
||||
period_query = '''
|
||||
SELECT * FROM measure_create_period
|
||||
'''
|
||||
|
||||
cursor.execute(data_query)
|
||||
data_list = cursor.fetchall()
|
||||
|
||||
cursor.execute(period_query)
|
||||
period_list = cursor.fetchall()
|
||||
|
||||
# 输出到文件
|
||||
with open(output_file_path, 'w', encoding='utf-8') as file:
|
||||
for data in data_list:
|
||||
config_id = data[0]
|
||||
show_measure = data[1]
|
||||
same_mean_measure = data[2]
|
||||
period_measure = data[3]
|
||||
change_measure = data[4]
|
||||
same_mean_measure_arr = []
|
||||
period_measure_arr = []
|
||||
change_measure_arr = []
|
||||
|
||||
if same_mean_measure != 'nan':
|
||||
same_mean_measure_arr = same_mean_measure.split(',')
|
||||
same_mean_measure_arr.append(show_measure)
|
||||
if period_measure != 'nan':
|
||||
period_measure_arr = period_measure.split(',')
|
||||
if change_measure != 'nan':
|
||||
change_measure_arr = change_measure.split(',')
|
||||
|
||||
for c in change_measure_arr:
|
||||
period_measure_arr.append(c)
|
||||
|
||||
for x in period_measure_arr:
|
||||
if x in change_measure_arr:
|
||||
show_name = show_measure + x
|
||||
else:
|
||||
show_name = x + show_measure
|
||||
for y in same_mean_measure_arr:
|
||||
if x in change_measure:
|
||||
parser_name = y + x
|
||||
else:
|
||||
parser_name = x + y
|
||||
|
||||
file.write(f'{show_name},{parser_name}\n')
|
||||
|
||||
for p in period_list:
|
||||
period_exra_name = p[0]
|
||||
period_exra_value = p[1]
|
||||
if period_exra_name in x:
|
||||
for v in period_exra_value.split(','):
|
||||
if x in change_measure:
|
||||
parser_name = y + x.replace(period_exra_name, v)
|
||||
else:
|
||||
parser_name = x.replace(period_exra_name, v) + y
|
||||
file.write(f'{show_name},{parser_name}\n')
|
||||
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
|
||||
# 根据老指标配置表生成新指标配置表
|
||||
def create_new_config(conn, cursor, table_name,old_year,new_year):
|
||||
|
||||
select_query = f'''
|
||||
SELECT measure_id, measure_name,ori_measure_id,ori_measure_name,delete_status,measure_vector,distance,year
|
||||
FROM {table_name}
|
||||
WHERE year = '{old_year}'
|
||||
'''
|
||||
cursor.execute(select_query)
|
||||
data_list = cursor.fetchall()
|
||||
|
||||
insert_query = f'''
|
||||
INSERT INTO {table_name}
|
||||
(measure_id, measure_name,ori_measure_id,ori_measure_name,delete_status,measure_vector,distance, year)
|
||||
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
|
||||
'''
|
||||
for data in data_list:
|
||||
ori_measure_name = data[3]
|
||||
if re.match(r'^\d{4}',ori_measure_name):
|
||||
year = int(re.match(r'^\d{4}',ori_measure_name).group(0))
|
||||
year += 1
|
||||
ori_measure_name = str(year) + ori_measure_name[4:]
|
||||
insert_data = (data[0],data[1],data[2],ori_measure_name,data[4],data[5],data[6],new_year)
|
||||
cursor.execute(insert_query, insert_data)
|
||||
conn.commit()
|
||||
|
||||
def measure_config_to_db(conn, cursor, table_name):
|
||||
year_list = ["2021","2022","2023","2024","2025"]
|
||||
for year in year_list:
|
||||
insert_query = f'''
|
||||
INSERT INTO {table_name}
|
||||
(measure_id, measure_name, ori_measure_id, ori_measure_name,delete_status,distance,year)
|
||||
VALUES (%s, %s, %s, %s,%s,%s,%s)
|
||||
'''
|
||||
check_query = f'''
|
||||
SELECT ori_measure_id FROM {table_name}
|
||||
WHERE year = '{year}'
|
||||
'''
|
||||
# 新增指标
|
||||
lines = [
|
||||
f"当期营业收入,{year}年第一季度营业收入",
|
||||
f"当期归母净利润,{year}年第一季度归母净利润",
|
||||
f"当期扣非净利润,{year}年第一季度扣非净利润",
|
||||
f"当期经营活动现金流净额,{year}年第一季度经营活动现金流净额",
|
||||
f"当期筹资活动现金流净额,{year}年第一季度筹资活动现金流净额",
|
||||
f"当期投资活动现金流净额,{year}年第一季度投资活动现金流净额",
|
||||
f"当期非经常性损益,{year}年第一季度非经常性损益",
|
||||
f"当期基本每股收益,{year}年第一季度基本每股收益",
|
||||
f"当期稀释每股收益,{year}年第一季度稀释每股收益",
|
||||
f"当期加权平均净资产收益率,{year}年第一季度加权平均净资产收益率",
|
||||
f"当期扣非加权平均净资产收益率,{year}年第一季度扣非加权平均净资产收益率",
|
||||
f"当期营业成本 ,{year}年第一季度营业成本",
|
||||
f"当期销售费用,{year}年第一季度销售费用",
|
||||
f"当期管理费用,{year}年第一季度管理费用",
|
||||
f"当期财务费用,{year}年第一季度财务费用",
|
||||
f"当期研发费用,{year}年第一季度研发费用"]
|
||||
# 打印每一行
|
||||
for line in lines:
|
||||
config_list = line.strip().split(',')
|
||||
measure = config_list[0]
|
||||
ori_measure = config_list[1]
|
||||
ori_measure_id = utils.get_md5(ori_measure)
|
||||
|
||||
# 判断数据库中是否有数据
|
||||
cursor.execute(check_query)
|
||||
check_records = cursor.fetchall()
|
||||
if any(record[0] == ori_measure_id for record in check_records):
|
||||
continue
|
||||
|
||||
data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure,0,0.94,year)
|
||||
cursor.execute(insert_query, data_to_insert)
|
||||
conn.commit()
|
||||
|
||||
def insert_measure_vector(conn,cursor,table_name):
|
||||
from config import REDIS_HOST,REDIS_PASSWORD,REDIS_PORT
|
||||
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)# 192.168.0.172 #测试123.60.153.169
|
||||
# 执行SQL语句,更新数据
|
||||
select_query = f'''
|
||||
SELECT ori_measure_id,ori_measure_name FROM {table_name}
|
||||
'''
|
||||
cursor.execute(select_query)
|
||||
records = cursor.fetchall()
|
||||
print(f"总计{len(records)}条数据")
|
||||
for record in records:
|
||||
if redis_client.hexists('measure_config', record[0]):
|
||||
measure_vector = redis_client.hget('measure_config', record[0])
|
||||
else:
|
||||
print('新增指标',record[1])
|
||||
vector_obj = utils.embed_with_str(record[1])
|
||||
measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])
|
||||
|
||||
redis_client.hset('measure_config', record[0], measure_vector)
|
||||
redis_client.close()
|
||||
conn.close()
|
||||
#from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
|
||||
if __name__ == "__main__":
|
||||
#需要先清空本地数据库的 measure_create_config 和 measure_create_period 表
|
||||
|
||||
# process_excel_and_db(
|
||||
# 'F:\\11_pdf\\ttt_1.xlsx',#ttt文件
|
||||
# 'F:\\11_pdf\\period_1.xlsx',#period文件
|
||||
# 'F:\\11_pdf\\out_2022_new_year.txt'#输出文件
|
||||
# )
|
||||
from config import MYSQL_HOST_APP, MYSQL_USER_APP, MYSQL_PASSWORD_APP, MYSQL_DB_APP
|
||||
conn = mysql.connector.connect(
|
||||
host=MYSQL_HOST_APP,
|
||||
user=MYSQL_USER_APP,
|
||||
password=MYSQL_PASSWORD_APP,
|
||||
database=MYSQL_DB_APP
|
||||
)
|
||||
cursor = conn.cursor()
|
||||
#file_path = r'F:\\11_pdf\\out_2022_new_year.txt'
|
||||
|
||||
|
||||
|
||||
# 更新第一季度的measure_vector
|
||||
table_name = 'measure_config'
|
||||
# 写入mysql
|
||||
# measure_config_to_db(conn, cursor, table_name)
|
||||
create_new_config(conn, cursor, table_name,'2023','2024')
|
||||
# 插入redies
|
||||
insert_measure_vector(conn,cursor,table_name)
|
||||
|
||||
|
|
@ -0,0 +1,51 @@
|
|||
import logging
|
||||
import os
|
||||
from logging.handlers import RotatingFileHandler
|
||||
|
||||
def setup_logging():
|
||||
# 创建logs目录(如果不存在)
|
||||
log_dir = 'logs'
|
||||
if not os.path.exists(log_dir):
|
||||
os.makedirs(log_dir)
|
||||
|
||||
# 配置根日志记录器
|
||||
root_logger = logging.getLogger()
|
||||
|
||||
# 如果已经有handlers,先移除它们以防重复
|
||||
if root_logger.handlers:
|
||||
for handler in root_logger.handlers[:]:
|
||||
root_logger.removeHandler(handler)
|
||||
|
||||
root_logger.setLevel(logging.INFO)
|
||||
|
||||
# 创建格式化器
|
||||
formatter = logging.Formatter(
|
||||
'%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
|
||||
# 创建文件处理器
|
||||
file_handler = RotatingFileHandler(
|
||||
os.path.join(log_dir, 'app.log'),
|
||||
maxBytes=10*1024*1024, # 10MB
|
||||
backupCount=5
|
||||
)
|
||||
file_handler.setLevel(logging.INFO)
|
||||
file_handler.setFormatter(formatter)
|
||||
|
||||
# 创建控制台处理器
|
||||
console_handler = logging.StreamHandler()
|
||||
console_handler.setLevel(logging.INFO)
|
||||
console_handler.setFormatter(formatter)
|
||||
|
||||
# 添加处理器到根日志记录器
|
||||
root_logger.addHandler(file_handler)
|
||||
root_logger.addHandler(console_handler)
|
||||
|
||||
# 设置propagate=False以防止日志消息向上传播
|
||||
for logger_name in logging.root.manager.loggerDict:
|
||||
logger = logging.getLogger(logger_name)
|
||||
logger.propagate = False
|
||||
|
||||
return root_logger
|
||||
|
||||
logger = setup_logging()
|
|
@ -22,8 +22,7 @@ from multiprocessing import Process
|
|||
from config import REDIS_HOST,REDIS_PORT,REDIS_PASSWORD
|
||||
import redis
|
||||
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection,MilvusClient
|
||||
|
||||
|
||||
from log_config import logger
|
||||
|
||||
'''
|
||||
已知发现问题:
|
||||
|
@ -40,7 +39,7 @@ from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Colle
|
|||
|
||||
|
||||
STR_PATTERN = '营业收入|净利润|变动比例|损益|现金流量净额|现金净流量|现金流|每股收益|总资产|资产总额|收益率|货币资金|应收账款|存货|固定资产|在建工程|商誉|短期借款|应付账款|合同负债|长期借款|营业成本|销售费用|管理费用|财务费用|研发费用|研发投入|计入当期损益的政府补助'
|
||||
PATTERN = '品牌类型|分门店|销售渠道|行业名称|产品名称|地区名称|子公司名称|业绩快报|调整情况说明|调整年初资产负债表|主要子公司|分部|母公司资产负债表|显示服务|渠道|商品类型|合同分类|会计政策变更|地区分类|研发项目|分类产品|表头不合规的表格|内部控制评价|关联方|国内地区|国外地区|销售区域|存货库龄|外币|逾期60天以上|欧元|英镑|美元|日元'
|
||||
PATTERN = '品牌类型|分门店|销售渠道|行业名称|产品名称|地区名称|子公司名称|业绩快报|调整情况说明|调整年初资产负债表|主要子公司|分部|母公司资产负债表|显示服务|渠道|商品类型|合同分类|会计政策变更|地区分类|研发项目名称|分类产品|表头不合规的表格|内部控制评价|关联方|国内地区|国外地区|销售区域|存货库龄|外币|逾期60天以上|欧元|英镑|(?<=\d)美元|\美元(?=\d)|日元'
|
||||
MUILT_PATTERN = '调整前'
|
||||
#unit_pattern = re.compile(r'单位[:|:]?(百万元|千万元|亿元|万元|千元|元)')
|
||||
unit_pattern = re.compile(r'(单位|单元|人民币).{0,6}?(百万元|千万元|亿元|万元|千元|元).{0,3}?')#修改单位匹配规则,不限制冒号,只限制距离
|
||||
|
@ -81,7 +80,7 @@ def safe_process_array(func, arr):
|
|||
try:
|
||||
return func(arr)
|
||||
except Exception as e:
|
||||
print(f"这个函数出现了报错{func.__name__}: {e}")
|
||||
logger.info(f"这个函数出现了报错{func.__name__}: {e}")
|
||||
return arr # 返回原数组以便继续后续处理
|
||||
|
||||
#单独针对三季报的资产负债表识别合并问题
|
||||
|
@ -199,7 +198,7 @@ def process_array_with_grants(arr, keywords=['本报告期', '年初至报告期
|
|||
|
||||
def get_table_range(file_path, file_id, pages, tables_range):
|
||||
|
||||
print('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
|
||||
logger.info(f'Run task 解析表格--{pages} {os.getpid()}')
|
||||
start = time.time()
|
||||
|
||||
conn = mysql.connector.connect(
|
||||
|
@ -229,6 +228,21 @@ def get_table_range(file_path, file_id, pages, tables_range):
|
|||
page_num = int(t.page)
|
||||
table_index = int(t.order)
|
||||
arr = np.array(t.data)
|
||||
|
||||
|
||||
if page_num != 0:
|
||||
# 表格数据写入
|
||||
line_texts = []
|
||||
for lines in t.data:
|
||||
for line in lines:
|
||||
line_texts.append(line)
|
||||
|
||||
db_service.batch_insert_page_text_nocheck({
|
||||
'file_id': file_id,
|
||||
'page_num' : page_num,
|
||||
'text' : line_texts
|
||||
},conn,cursor)
|
||||
|
||||
arr = safe_process_array(process_array, arr) #部分资产负债表合并问题
|
||||
arr = safe_process_array(process_array_with_annual_comparison, arr) #复杂表格的优化"多个上年同期时处理"
|
||||
arr = safe_process_array(process_array_with_grants, arr) #三季报的非经常损益
|
||||
|
@ -397,6 +411,7 @@ def get_table_range(file_path, file_id, pages, tables_range):
|
|||
pattern = re.findall(PATTERN,arr_str)
|
||||
muilt_pattern = re.findall(MUILT_PATTERN,arr_str)
|
||||
if len(matches) > 0 and len(pattern) == 0 and len(muilt_pattern)<5:
|
||||
|
||||
if not tables_range.get(page_num):
|
||||
tables_range[page_num] = []
|
||||
|
||||
|
@ -421,8 +436,14 @@ def get_table_range(file_path, file_id, pages, tables_range):
|
|||
"data" : new_data,
|
||||
'sort_num' : page_num*1000 - top
|
||||
}},conn_app,cursor_app)
|
||||
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f'camelot解析表格时出现了{e}')
|
||||
logger.info(f'camelot解析表格时出现了{e}')
|
||||
|
||||
|
||||
|
||||
get_text_content(file_path, file_id, tables_range, pages, conn, cursor, redis_client, conn_app, cursor_app)
|
||||
|
||||
cursor.close()
|
||||
|
@ -432,7 +453,7 @@ def get_table_range(file_path, file_id, pages, tables_range):
|
|||
redis_client.close()
|
||||
|
||||
end = time.time()
|
||||
print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
|
||||
logger.info('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
|
||||
|
||||
def text_in_table(top, tables_range, page_num):
|
||||
if tables_range.get(page_num):
|
||||
|
@ -468,7 +489,7 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
|
|||
|
||||
page_start = pages.split('-')[0]
|
||||
page_end = pages.split('-')[1]
|
||||
print(f'pages的值为{pages}')
|
||||
logger.info(f'pages的值为{pages}')
|
||||
select_year_select = f"""select report_type,year from report_check where id = {file_id}"""
|
||||
cursor.execute(select_year_select)
|
||||
record_select = cursor.fetchall()
|
||||
|
@ -513,8 +534,8 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
|
|||
line_text = re.sub(r"\s", "", line_text)
|
||||
|
||||
#提取符合要求的文本写入pdf_text_info,用于文本书写错误识别
|
||||
if not utils.pdf_text_flag(line_text):
|
||||
line_texts.append(line_text)
|
||||
# if not utils.pdf_text_flag(line_text):
|
||||
line_texts.append(line_text)
|
||||
#db_service.insert_pdf_text_info({
|
||||
# 'file_id': file_id,
|
||||
# 'page_num' : pagenum+1,
|
||||
|
@ -536,7 +557,7 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
|
|||
if text_type in ('page_header','page_footer'):
|
||||
break
|
||||
if pagenum ==44:
|
||||
print(f'line_text在第44页的值有{line_text}')
|
||||
logger.info(f'line_text在第44页的值有{line_text}')
|
||||
#这个对一整页都有用,会去掉很多正确的表
|
||||
# 记录需要过滤掉的页码
|
||||
if len(re.findall('母公司|现金流量表补充', line_text)) > 0 :
|
||||
|
@ -546,10 +567,11 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
|
|||
'type': 'parent_com',
|
||||
},conn_app,cursor_app)
|
||||
|
||||
|
||||
# 保存每个表格上方小范围区域的文字,这部分内容包含了表格的标题和指标单位
|
||||
table_info = {}
|
||||
|
||||
if utils.check_table_title_black_list(line_text,title_list):
|
||||
|
||||
db_service.insert_measure_parser_info({
|
||||
'file_id': file_id,
|
||||
'content': f"{range['page_num']}_{range['table_index']}",
|
||||
|
@ -613,6 +635,8 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
|
|||
table_info = {}
|
||||
# 记录需要过滤掉的页码
|
||||
if len(re.findall('母公司|现金流量表补充', line_text)) > 0:
|
||||
logger.info(f'line_text{line_text}')
|
||||
logger.info(f'pagenum{pagenum}')
|
||||
db_service.insert_measure_parser_info({
|
||||
'file_id': file_id,
|
||||
'content': pagenum+2,
|
||||
|
@ -665,8 +689,8 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
|
|||
'text' : line_texts
|
||||
},conn,cursor)
|
||||
except Exception as e:
|
||||
print(f'{pagenum}页处理异常')
|
||||
print(e)
|
||||
logger.info(f'{pagenum}页处理异常')
|
||||
logger.info(e)
|
||||
|
||||
|
||||
def get_table_unit_info(file_id,line_text,page_num,table_index):
|
||||
|
@ -725,7 +749,7 @@ def get_table_measure(file_id, pdf_tables, record_range,partition_name,):
|
|||
uri=MILVUS_CLIENT,
|
||||
)
|
||||
|
||||
print('提取指标任务 %s (%s)...' % (record_range, os.getpid()))
|
||||
logger.info('提取指标任务 %s (%s)...' % (record_range, os.getpid()))
|
||||
start = time.time()
|
||||
record_start = record_range.split('-')[0]
|
||||
record_end = record_range.split('-')[1]
|
||||
|
@ -738,10 +762,8 @@ def get_table_measure(file_id, pdf_tables, record_range,partition_name,):
|
|||
arr = np.array(t['data'])
|
||||
rows, cols = arr.shape
|
||||
if rows == 1 and cols == 1:
|
||||
continue
|
||||
|
||||
continue
|
||||
row_num , col_num = -1 , -1
|
||||
|
||||
# 使用嵌套循环遍历数组,获取第一个数值位置
|
||||
for i in range(rows):
|
||||
for j in range(cols):
|
||||
|
@ -834,6 +856,8 @@ def get_table_measure(file_id, pdf_tables, record_range,partition_name,):
|
|||
|
||||
redis_client.incr(f'parsed_measure_count_{file_id}')
|
||||
|
||||
|
||||
|
||||
if len(measure_list) > 0:
|
||||
data_dict["measure_list"] = measure_list
|
||||
data_dict["page_num"] = f"{str(t['page_num'])}_{str(t['table_index'])}"
|
||||
|
@ -841,12 +865,12 @@ def get_table_measure(file_id, pdf_tables, record_range,partition_name,):
|
|||
measure_obj.append(data_dict)
|
||||
db_service.insert_measure_data_to_milvus(client,partition_name,measure_obj,cursor_app,conn_app)
|
||||
except Exception as e:
|
||||
print(f"循环获取表格数据这里报错了,数据是{t['data']},位置在{index}")
|
||||
print(f"错误是:{e}")
|
||||
logger.info(f"循环获取表格数据这里报错了,数据是{t['data']},位置在{index}")
|
||||
logger.info(f"错误是:{e}")
|
||||
end = time.time()
|
||||
print('提取指标 %s runs %0.2f seconds.' % (record_range, (end - start)))
|
||||
logger.info('提取指标 %s runs %0.2f seconds.' % (record_range, (end - start)))
|
||||
except Exception as e:
|
||||
print(f'这个错误是{e},所在的位置是{record_start}-{record_end}')
|
||||
logger.info(f'这个错误是{e},所在的位置是{record_start}-{record_end}')
|
||||
record_start = record_range.split('-')[0]
|
||||
record_end = record_range.split('-')[1]
|
||||
for index in range(int(record_start),int(record_end)):
|
||||
|
@ -857,7 +881,7 @@ def get_table_measure(file_id, pdf_tables, record_range,partition_name,):
|
|||
try:
|
||||
arr = np.array(t['data'])
|
||||
except Exception as e:
|
||||
print(f'这个错误是{e}的arr的值是{arr}')
|
||||
logger.info(f'这个错误是{e}的arr的值是{arr}')
|
||||
finally:
|
||||
redis_client.close()
|
||||
cursor.close()
|
||||
|
@ -877,7 +901,7 @@ def dispatch_job(job_info):
|
|||
get_table_range(path, file_id, page_num, tables_range)
|
||||
|
||||
except Exception as e:
|
||||
print(e)
|
||||
logger.info(e)
|
||||
|
||||
#指标归一化处理
|
||||
|
||||
|
@ -901,18 +925,58 @@ def update_measure_data(file_id,file_path,parent_table_pages,partition_name):
|
|||
|
||||
# 创建一个cursor对象来执行SQL语句
|
||||
cursor_app = conn_app.cursor(buffered=True)
|
||||
print(f'目录黑名单为:{parent_table_pages}')
|
||||
db_service.delete_to_run(conn,cursor,file_id)
|
||||
logger.info(f'目录黑名单为:{parent_table_pages}')
|
||||
db_service.insert_table_measure_from_vector_async_process(cursor,parent_table_pages,file_id,file_path, partition_name)
|
||||
|
||||
|
||||
|
||||
|
||||
# #指标归一化处理
|
||||
db_service.update_ori_measure(conn,cursor,file_id)
|
||||
#db_service.delete_database(conn_app,cursor_app,file_id)
|
||||
|
||||
cursor.close()
|
||||
conn.close()
|
||||
cursor_app.close()
|
||||
conn_app.close()
|
||||
|
||||
|
||||
# def merge_consecutive_arrays(word_info):
|
||||
# merged_objects = []
|
||||
# temp_list = []
|
||||
|
||||
# for info_obj in word_info:
|
||||
# try:
|
||||
# if info_obj['type'] == 'table':
|
||||
# # 如果对象是表格,将其元素添加到临时列表中
|
||||
# data = info_obj['data']
|
||||
# if not data:
|
||||
# continue
|
||||
# first_row = data[0]
|
||||
# if all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) == 0:
|
||||
# temp_list.append(info_obj)
|
||||
# elif all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) > 0:
|
||||
# merged_objects.append(temp_list)
|
||||
# temp_list = []
|
||||
# temp_list.append(info_obj)
|
||||
# elif not all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) > 0:
|
||||
# temp_data = temp_list[-1]['data']
|
||||
# temp_data = list(temp_data)
|
||||
# for row in list(info_obj['data']):
|
||||
# temp_data.append(row)
|
||||
# info_obj['data'] = temp_data
|
||||
# temp_list.clear()
|
||||
# temp_list.append(info_obj)
|
||||
|
||||
|
||||
# except Exception as e:
|
||||
|
||||
# applog.error(f"解析数据错误: {e}")
|
||||
|
||||
# if temp_list:
|
||||
# merged_objects.append(temp_list)
|
||||
|
||||
# return merged_objects
|
||||
|
||||
def merge_consecutive_arrays(pdf_info):
|
||||
merged_objects = []
|
||||
temp_array = {}
|
||||
|
@ -941,7 +1005,7 @@ def merge_consecutive_arrays(pdf_info):
|
|||
temp_array = {} # 重置临时列表
|
||||
except Exception as e:
|
||||
#print(info_obj)
|
||||
print(f"解析数据错误: {e}")
|
||||
logger.info(f"解析数据错误: {e}")
|
||||
|
||||
if temp_array:
|
||||
merged_objects.append(temp_array)
|
||||
|
@ -980,7 +1044,7 @@ def merge_consecutive_arrays_v1(pdf_info):
|
|||
merged_objects.append(temp_array)
|
||||
temp_array = {} # 重置临时列表
|
||||
except Exception as e:
|
||||
print(f"解析数据错误: {e}")
|
||||
logger.info(f"解析数据错误: {e}")
|
||||
|
||||
# 循环结束后,检查临时列表是否非空,如果非空,则添加到结果中
|
||||
if temp_array:
|
||||
|
@ -1003,9 +1067,13 @@ def start_table_measure_job(file_id,partition_name):
|
|||
cursor_app.execute(select_process_query)
|
||||
records = cursor_app.fetchall()
|
||||
pdf_info = []
|
||||
|
||||
for record in records:
|
||||
pdf_info.append(eval(record[0]))
|
||||
|
||||
try:
|
||||
pdf_info.append(eval(record[0]))
|
||||
except Exception as e:
|
||||
logger.info(f'文本报错{e}')
|
||||
|
||||
sorted_pdf_info = sorted(pdf_info, key=lambda k: k['sort_num'])
|
||||
pdf_tables = merge_consecutive_arrays(sorted_pdf_info)
|
||||
|
||||
|
@ -1017,7 +1085,7 @@ def start_table_measure_job(file_id,partition_name):
|
|||
redis_client.close()
|
||||
|
||||
records_range_parts = utils.get_range(len(pdf_tables),MEASURE_COUNT)
|
||||
print(f'records_range_part识别页码的值为{records_range_parts}')
|
||||
logger.info(f'records_range_part识别页码的值为{records_range_parts}')
|
||||
processes = []
|
||||
|
||||
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -83,7 +83,7 @@ def get_page_end(start, depth, title_array):
|
|||
|
||||
def get_file_split(page_count):
|
||||
# 获取 CPU 核数
|
||||
cpu_count = os.cpu_count()
|
||||
cpu_count = 4
|
||||
if page_count < cpu_count:
|
||||
cpu_count = page_count
|
||||
# 使用 divmod() 函数计算除法结果和余数
|
||||
|
@ -168,6 +168,68 @@ def create_text_outline(pdf_path, file_id):
|
|||
|
||||
return file_info
|
||||
|
||||
|
||||
def create_text_outline_disclosure(pdf_path, file_id):
|
||||
# print('Running the script for [%s] with padding [%d]' % (pdf_path, page_number_padding))
|
||||
# creating an object
|
||||
with open(pdf_path, 'rb') as file:
|
||||
file_info = {}
|
||||
fileReader = PyPDF2.PdfReader(file)
|
||||
page_count = len(fileReader.pages)
|
||||
|
||||
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
|
||||
redis_client.set(f'page_count_{file_id}', page_count)
|
||||
|
||||
info = {
|
||||
'page_count': page_count,
|
||||
'all_pages': {},
|
||||
'current_page_id': 1,
|
||||
'padding': 0
|
||||
}
|
||||
|
||||
print('Number of pages: %d' % info['page_count'])
|
||||
|
||||
pages = fileReader.trailer['/Root']['/Pages'].get_object()
|
||||
recursive_numbering(pages, info)
|
||||
#for page_num, page in enumerate(pages['/Kids']):
|
||||
# page_obj = page.getObject()
|
||||
# all_pages[id(page_obj)] = page_num + 1 # who starts counting from 0 anyways?
|
||||
title_array = get_tree_pages(fileReader.outline, info, 0, [])
|
||||
#db_service.pdf_title_insert_mysql(file_id,title_array)
|
||||
#title_array = db_service.get_file_info_from_mysql(file_id)
|
||||
|
||||
parent_table_pages_local = {}
|
||||
parent_table_pages_local[file_id] = []
|
||||
print(f'{file_id}:{len(title_array)}')
|
||||
for i in range(len(title_array)):
|
||||
title_obj = title_array[i]
|
||||
title = title_obj['title']
|
||||
#print(f'标题分别是{title}')
|
||||
if len(re.findall('母公司|现金流量表补充|重要会计政策|会计估计变更|公允价值的披露|合营安排或联营企业中的权益|与金融工具相关的风险|税项|主要控股参股公司|结构化主体情况|公司股份总数及股东结构变动及公司资产和负债结构的变动情况|所有权或使用权受到限制的资产|在建工程|固定资产|其他主体中的权益|分部信息|与金融工具相关的风险|其他关联交易|公司子公司重大事项', title)) >0 :
|
||||
page_start = title_obj['page_num']
|
||||
depth = title_obj['depth']
|
||||
if i < len(title_array) - 1:
|
||||
page_end = title_array[i+1]['page_num']
|
||||
if title_array[i]['depth'] in [1,2]:
|
||||
page_end = get_page_end(i+1, depth, title_array)
|
||||
else:
|
||||
page_end = page_count
|
||||
print(f'目录识别时被丢弃的页码:{page_start}-{page_end}')
|
||||
|
||||
#当标题为母公司财务报表主要项目注释时,最后一页不过滤,避免核心roe指标无法召回
|
||||
if len(re.findall('财务报表主要项目注释', title)) == 0:
|
||||
page_end = page_end - 1
|
||||
# print(title,page_start,page_end)
|
||||
for i in range(page_start, page_end + 1):
|
||||
# 将每个数字添加到列表中
|
||||
parent_table_pages_local[file_id].append(i)
|
||||
file_info['page_count'] = page_count
|
||||
file_info['parent_table_pages'] = parent_table_pages_local[file_id]
|
||||
file_info['split_parts'] = get_file_split(page_count)
|
||||
|
||||
redis_client.close()
|
||||
|
||||
return file_info
|
||||
if __name__ == '__main__':
|
||||
import time
|
||||
path = "/Users/zhengfei/Desktop/cb/2023年报检测/安妮股份.pdf"
|
||||
|
|
|
@ -2,18 +2,18 @@
|
|||
|
||||
# 设置文件路径和目标目录# 请注意这列的config文件是不可以进行传输的 /root/pdf_parser/zzb_data_prod/utils.py /root/pdf_parser/zzb_data_prod/db_service.py
|
||||
#FILES="/root/pdf_parser/zzb_data_prod/utils.py /root/pdf_parser/zzb_data_prod/db_service.py /root/pdf_parser/zzb_data_prod/app.py /root/pdf_parser/zzb_data_prod/main.py /root/pdf_parser/zzb_data_prod/pdf_title.py"
|
||||
FILES="/root/pdf_parser/zzb_data_prod/main.py"
|
||||
FILES="/root/pdf_parser/zzb_data_prod/put_code.sh"
|
||||
DEST_PATH="/root/pdf_parser/zzb_data_prod"
|
||||
|
||||
# 设置服务器列表 主服务器 "1.94.143.23" "113.44.72.157" "1.94.101.237" "123.60.16.225" "124.71.157.162" "1.94.60.103" "1.94.143.23" "124.71.149.225" "113.44.52.221" "121.37.137.13"
|
||||
#SERVERS=("113.44.72.157" "1.94.101.237" "123.60.16.225" "124.71.157.162" "1.94.60.103" "124.71.149.225" "113.44.52.221" "121.37.137.13" "123.60.28.83" "192.168.0.19" "192.168.0.53" "192.168.0.150" "192.168.0.210" "192.168.0.129" "192.168.0.24" "192.168.0.250" "192.168.0.162" "192.168.0.86" "192.168.0.88" "192.168.0.93" "192.168.0.228" "192.168.0.155" "192.168.0.186" "192.168.0.56" "192.168.0.185" "192.168.0.72" "192.168.0.35" "192.168.0.230" "192.168.0.125" "192.168.0.46" "192.168.0.131")
|
||||
#SERVERS=("192.168.0.228" "192.168.0.155" "192.168.0.186" "192.168.0.56" "192.168.0.185")
|
||||
#监管服务器
|
||||
SERVERS=("192.168.0.108" "192.168.0.131")
|
||||
#SERVERS=("192.168.0.108" "192.168.0.131")
|
||||
#企业服务器
|
||||
#SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239")
|
||||
#两者一起
|
||||
#SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239" "192.168.0.108" "192.168.0.131")
|
||||
SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239" "192.168.0.108" "192.168.0.131")
|
||||
# 遍历每个服务器并上传文件
|
||||
for SERVER in "${SERVERS[@]}"; do
|
||||
echo "Uploading files to $SERVER"
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
import redis
|
||||
from config import REDIS_HOST, REDIS_PORT, REDIS_PASSWORD
|
||||
|
||||
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
|
||||
# 从 MySQL 表中读取数据并写入 Redis
|
||||
def read_from_file_and_write_to_redis(redis_client,ori_measure_id,measure_vector):
|
||||
# Redis 连接配置
|
||||
|
@ -10,7 +13,7 @@ def read_from_redis(redis_client,ori_measure_id):
|
|||
return redis_client.hget('measure_config',ori_measure_id).decode()
|
||||
|
||||
if __name__ == "__main__":
|
||||
redis_client = redis.Redis(host='192.168.0.175', port=6379, password='Xgf_redis', db=6)
|
||||
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
|
||||
|
||||
value = read_from_redis(redis_client,"bb3cf43f3dba147373c706c6567b5a")
|
||||
print(value)
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
import redis
|
||||
from config import REDIS_HOST, REDIS_PORT, REDIS_PASSWORD
|
||||
|
||||
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
|
|
@ -10,4 +10,6 @@ pydantic
|
|||
uvicorn
|
||||
redis
|
||||
ghostscript
|
||||
opencv-python-headless
|
||||
opencv-python-headless
|
||||
python-docx
|
||||
docx2pdf
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,3 @@
|
|||
--2024-12-27 11:23:36-- https://financial-report.obs.cn-east-3.myhuaweicloud.com/upload/file/44b374ac0fe140a2922c360db47335a1.PDF?AccessKeyId=WMBIZTLULUR24OBUIRC4
|
||||
Resolving financial-report.obs.cn-east-3.myhuaweicloud.com (financial-report.obs.cn-east-3.myhuaweicloud.com)... failed: Name or service not known.
|
||||
wget: unable to resolve host address ‘financial-report.obs.cn-east-3.myhuaweicloud.com’
|
|
@ -1,154 +1,14 @@
|
|||
#coding=utf-8
|
||||
import sys,ast
|
||||
from pdfminer.high_level import extract_text
|
||||
from pdfminer.pdfparser import PDFParser
|
||||
from pdfminer.pdfdocument import PDFDocument
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
import utils
|
||||
import mysql.connector
|
||||
from pymilvus import connections,MilvusClient
|
||||
import json
|
||||
import db_service
|
||||
import ast
|
||||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import numpy as np
|
||||
import config
|
||||
import redis_service
|
||||
from config import MILVUS_CLIENT,MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB
|
||||
import main
|
||||
import redis
|
||||
|
||||
def measure_config_to_db(conn,cursor):
|
||||
insert_query = '''
|
||||
INSERT INTO measure_config
|
||||
(measure_id, measure_name, ori_measure_id, ori_measure_name)
|
||||
VALUES (%s, %s, %s, %s)
|
||||
'''
|
||||
check_query = '''
|
||||
select ori_measure_id from measure_config
|
||||
'''
|
||||
# 打开文本文件
|
||||
with open('/Users/zhengfei/work/zzb_data/measure_config_all.txt', 'r') as file:
|
||||
# 读取所有行到一个列表中
|
||||
lines = file.readlines()
|
||||
|
||||
# 打印每一行
|
||||
for line in lines:
|
||||
config_list = line.strip().split(',')
|
||||
measure = config_list[0]
|
||||
ori_measure = config_list[1]
|
||||
ori_measure_id = utils.get_md5(ori_measure)
|
||||
# 判断数据库中是否有数据
|
||||
# cursor.execute(check_query.format(ori_measure_id=ori_measure_id))
|
||||
# check_records = cursor.fetchall()
|
||||
# if(len(check_records)) > 0:
|
||||
# continue
|
||||
data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure)
|
||||
cursor.execute(insert_query, data_to_insert)
|
||||
conn.commit()
|
||||
|
||||
def insert_measure_vector(conn,cursor):
|
||||
|
||||
redis_client = redis.Redis(host='192.168.0.172', port=6379, password='Xgf_redis', db=6)
|
||||
# 执行SQL语句,更新数据
|
||||
select_query = '''
|
||||
SELECT ori_measure_id,ori_measure_name FROM measure_config
|
||||
'''
|
||||
cursor.execute(select_query)
|
||||
records = cursor.fetchall()
|
||||
for record in records:
|
||||
if redis_client.hexists('measure_config', record[0]):
|
||||
measure_vector = redis_client.hget('measure_config', record[0])
|
||||
else:
|
||||
print('新增指标',record[1])
|
||||
vector_obj = utils.embed_with_str(record[1])
|
||||
measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])
|
||||
|
||||
redis_client.hset('measure_config', record[0], measure_vector)
|
||||
redis_client.close()
|
||||
conn.close()
|
||||
|
||||
def contains_financial_indicators(text):
|
||||
import re
|
||||
# 正则表达式模式匹配千分位格式的数字和百分比
|
||||
pattern = r"\d{1,3}(,\d{3})+(\.\d{1,3})?"
|
||||
|
||||
pattern1 = r"\d+(.\d+)+%?"
|
||||
# 使用 re.search 函数查找匹配项
|
||||
match = re.search(pattern1, text)
|
||||
|
||||
# 如果找到匹配项,返回 True,否则返回 False
|
||||
return bool(match)
|
||||
|
||||
def get_clean_text(text):
|
||||
import re
|
||||
pattern = r"\([^)]*?\)"
|
||||
matches = re.findall(pattern, text)
|
||||
for match in matches:
|
||||
# 使用 re.findall 函数查找括号内的内容中是否包含月份或关键词
|
||||
month_keywords_found = re.search(r"归属于|扣非", match)
|
||||
if not month_keywords_found:
|
||||
# 如果包含,则从文本中删除该部分
|
||||
text = re.sub(pattern,"", text)
|
||||
else:
|
||||
# 如果不包含,删除所有标点符号和中文数字
|
||||
text = re.sub(r"[^\w\s]", "", text)
|
||||
print(text)
|
||||
|
||||
def insert_and_update(conn,cursor,client,parent_table_pages,file_id,path):
|
||||
# #通过向量查询指标
|
||||
db_service.insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_id,path)
|
||||
|
||||
# #指标归一化处理
|
||||
db_service.update_ori_measure(conn,cursor,file_id)
|
||||
|
||||
def print_measure_data(cursor,client):
|
||||
select_query = '''
|
||||
SELECT ori_measure_name,measure_name,ori_measure_id FROM measure_config
|
||||
where measure_id not in(select distinct measure_id from ori_measure_list where file_id='64')
|
||||
'''
|
||||
cursor.execute(select_query)
|
||||
records = cursor.fetchall()
|
||||
for record in records:
|
||||
ori_measure_name = record[0]
|
||||
measure_name = record[1]
|
||||
ori_measure_id = record[2]
|
||||
measure_vector = redis_service.read_from_redis(ori_measure_id)
|
||||
|
||||
measure_list = ast.literal_eval(measure_vector)
|
||||
data = [measure_list]
|
||||
res = client.search(
|
||||
collection_name="pdf_measure_v4", # Replace with the actual name of your collection
|
||||
# Replace with your query vector
|
||||
data=data,
|
||||
limit=2, # Max. number of search results to return
|
||||
search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
|
||||
output_fields=["measure_name","measure_value","table_num","table_index"],
|
||||
filter = 'file_id == "64"'
|
||||
)
|
||||
vector_str = measure_name+":"+ori_measure_name
|
||||
# Convert the output to a formatted JSON string
|
||||
for i in range(len(res[0])):
|
||||
|
||||
vector_distance = float(res[0][i]["distance"])
|
||||
vector_measure_name = res[0][i]["entity"]["measure_name"]
|
||||
measure_value = res[0][i]["entity"]["measure_value"]
|
||||
table_num = res[0][i]["entity"]["table_num"]
|
||||
table_index = res[0][i]["entity"]["table_index"]
|
||||
table_num_list = [106]
|
||||
print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index))
|
||||
# if vector_distance > 0.89 and table_num not in table_num_list:
|
||||
# print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(0.94))
|
||||
# if vector_distance > distance and table_num not in table_num_list:
|
||||
# print(vector_str +":"+vector_measure_name +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(vector_distance)+":"+str(distance))
|
||||
|
||||
|
||||
list1 = [['2.将重分类进损益的其他综合收益', '', '-135441.46', '58032.20'], ['(1)权益法下可转损益的其他综合收益', '', '', ''], ['(2)其他债权投资公允价值变动', '', '', ''], ['(3)金融资产重分类计入其他综合收益的金额', '', '', ''], ['(4)其他债权投资信用减值准备', '', '', ''], ['(5)现金流量套期储备', '', '', ''], ['(6)外币财务报表折算差额', '', '-135441.46', '58032.20'], ['(7)其他', '', '', ''], ['(二)归属于少数股东的其他综合收益的税后净额', '', '', ''], ['七、综合收益总额', '', '-154059285.14', '15109700.10'], ['(一)归属于母公司所有者的综合收益总额', '', '-153881248.66', '15109700.10'], ['(二)归属于少数股东的综合收益总额', '', '-178036.48', ''], ['八、每股收益:', '八、每股收益:', '八、每股收益:', '八、每股收益:'], ['(一)基本每股收益(元/股) -0.6693 0.0715', '(一)基本每股收益(元/股) -0.6693 0.0715', '(一)基本每股收益(元/股) -0.6693 0.0715', '(一)基本每股收益(元/股) -0.6693 0.0715'], ['(二)稀释每股收益(元/股) -0.6693 0.0714', '(二)稀释每股收益(元/股) -0.6693 0.0714', '(二)稀释每股收益(元/股) -0.6693 0.0714', '(二)稀释每股收益(元/股) -0.6693 0.0714']]
|
||||
# 测试代码
|
||||
if __name__ == "__main__":
|
||||
conn = mysql.connector.connect(
|
||||
host=MYSQL_HOST,
|
||||
user=MYSQL_USER,
|
||||
password=MYSQL_PASSWORD,
|
||||
database=MYSQL_DB
|
||||
)
|
||||
cursor = conn.cursor()
|
||||
|
||||
insert_measure_vector(conn,cursor)
|
||||
for lines in list1:
|
||||
line = list(set(lines))
|
||||
print(line)
|
||||
|
||||
|
||||
|
|
@ -7,6 +7,8 @@ from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
|
|||
import pdfplumber
|
||||
|
||||
import os
|
||||
import logging
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# 创建一个文本提取函数
|
||||
|
||||
|
@ -125,8 +127,8 @@ for pagenum, page in enumerate(extract_pages(pdf_path)):
|
|||
upper_side = element.y1
|
||||
# 从表中提取信息
|
||||
table = extract_table(pdf_path, pagenum, table_num)
|
||||
# print('第'+str(pagenum)+'页第'+str(table_num)+'个表格')
|
||||
# print(table)
|
||||
# log.info('第%s页第%s个表格', str(pagenum), str(table_num))
|
||||
# log.info(table)
|
||||
# 将表信息转换为结构化字符串格式
|
||||
table_string = table_converter(table)
|
||||
# 将表字符串追加到列表中
|
||||
|
@ -148,15 +150,15 @@ for pagenum, page in enumerate(extract_pages(pdf_path)):
|
|||
first_element = True
|
||||
table_num+=1
|
||||
|
||||
print('第'+str(pagenum)+'部分')
|
||||
print('page_text:')
|
||||
print(page_text)
|
||||
#print('line_format:')
|
||||
#print(line_format)
|
||||
#print('text_from_tables:')
|
||||
#print(text_from_tables)
|
||||
#print('page_content:')
|
||||
#print(page_content)
|
||||
log.info('第%s部分', str(pagenum))
|
||||
log.info('page_text:')
|
||||
log.info(page_text)
|
||||
#log.info('line_format:')
|
||||
#log.info(line_format)
|
||||
#log.info('text_from_tables:')
|
||||
#log.info(text_from_tables)
|
||||
#log.info('page_content:')
|
||||
#log.info(page_content)
|
||||
|
||||
# 创建字典的键
|
||||
dctkey = 'Page_'+str(pagenum)
|
||||
|
@ -171,7 +173,7 @@ pdfFileObj.close()
|
|||
|
||||
# 显示页面内容
|
||||
# result = ''.join(text_per_page['Page_0'][4])
|
||||
# print(result)
|
||||
# log.info(result)
|
||||
|
||||
# result1 = ''.join(text_per_page['Page_1'][4])
|
||||
# print(result1)
|
||||
# log.info(result1)
|
|
@ -4,6 +4,9 @@ import PyPDF2
|
|||
from pdfminer.high_level import extract_pages
|
||||
from pdfminer.layout import LTTextContainer, LTRect
|
||||
import pdfplumber
|
||||
import logging
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
import os
|
||||
|
||||
|
@ -82,7 +85,7 @@ for pagenum, page in enumerate(extract_pages(pdf_path)):
|
|||
|
||||
text_obj['page_num'] = pagenum
|
||||
text_obj['text'] = page_text
|
||||
print("pagenum:",pagenum," text:",page_text)
|
||||
log.info("pagenum: %s text: %s", pagenum, page_text)
|
||||
|
||||
# 打印提取的文本
|
||||
# print(page_obj)
|
||||
# log.info(page_obj)
|
|
@ -1,5 +1,7 @@
|
|||
import os
|
||||
import re
|
||||
import logging
|
||||
log = logging.getLogger(__name__)
|
||||
from tqdm import tqdm
|
||||
from pdfminer.pdfparser import PDFParser,PDFDocument
|
||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
|
@ -24,7 +26,7 @@ def pdf_parse(pdf_path,txt_path):
|
|||
|
||||
#检测文档是否提供txt转换,不提供就忽略
|
||||
if not doc.is_extractable:
|
||||
print(pdf_path)
|
||||
log.info(pdf_path)
|
||||
raise PDFTextExtractionNotAllowed
|
||||
else:
|
||||
#创建PDF,资源管理器,来共享资源
|
||||
|
@ -48,7 +50,7 @@ def pdf_parse(pdf_path,txt_path):
|
|||
if(isinstance(x,LTTextBoxHorizontal)):
|
||||
with open(txt_path,'a') as f:
|
||||
results = x.get_text()
|
||||
# print(results)
|
||||
# log.info(results)
|
||||
f.write(results +"\n")
|
||||
|
||||
|
||||
|
@ -68,5 +70,5 @@ if __name__ == '__main__':
|
|||
txt_path = save_txt_path+txt_name
|
||||
pdf_parse(pdf_path, txt_path)
|
||||
except:
|
||||
print("转换失败:", pdf_name)
|
||||
log.info("转换失败:%s", pdf_name)
|
||||
continue
|
|
@ -4,6 +4,8 @@ import os
|
|||
import json
|
||||
import numpy as np
|
||||
from datetime import datetime
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
# 读取PDF
|
||||
import PyPDF2
|
||||
# 分析PDF的layout,提取文本
|
||||
|
@ -230,7 +232,7 @@ def get_measure_from_llm(user_prompt):
|
|||
llm_measure_list = result.split('\n')
|
||||
return llm_measure_list
|
||||
else:
|
||||
print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
|
||||
logger.error('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
|
||||
response.request_id, response.status_code,
|
||||
response.code, response.message
|
||||
))
|
||||
|
@ -270,7 +272,7 @@ def parse_llm_measure_to_db(measure_info,type,conn,cursor):
|
|||
ori_measure_id = get_md5(ori_measure_name)
|
||||
data_to_insert = (file_id, file_name, type, int(page_num), int(table_index), ori_measure_id, ori_measure_name, ori_measure_value, create_time, create_time)
|
||||
cursor.execute(insert_query, data_to_insert)
|
||||
print(f"{type},{page_num},{table_index},{ori_measure_name},{ori_measure_value}")
|
||||
logger.info(f"{type},{page_num},{table_index},{ori_measure_name},{ori_measure_value}")
|
||||
|
||||
# 提交事务
|
||||
conn.commit()
|
||||
|
@ -300,7 +302,7 @@ def update_ori_measure(conn,cursor):
|
|||
|
||||
if __name__ == "__main__":
|
||||
start_time = datetime.now()
|
||||
print("开始时间:", start_time.strftime("%Y-%m-%d %H:%M:%S"))
|
||||
logger.info("开始时间:", start_time.strftime("%Y-%m-%d %H:%M:%S"))
|
||||
|
||||
path = "/Users/zhengfei/Desktop/科润智控1.pdf"
|
||||
table_info = get_table_measure(path)
|
||||
|
@ -324,10 +326,10 @@ if __name__ == "__main__":
|
|||
table_index = table_obj['page_num'].split("_")[1]
|
||||
table_measure = ','.join(table_obj['measure_list'])
|
||||
if table_page_num == '3':
|
||||
print(f"第{table_page_num}页表格指标为:{table_measure}")
|
||||
logger.info(f"第{table_page_num}页表格指标为:{table_measure}")
|
||||
table_llm_measure = get_measure_from_llm(table_measure)
|
||||
if table_page_num == '3':
|
||||
print(f"第{table_page_num}页表格llm指标为:{table_llm_measure}")
|
||||
logger.info(f"第{table_page_num}页表格llm指标为:{table_llm_measure}")
|
||||
# table_measure_obj['page_num'] = table_page_num
|
||||
# table_measure_obj['table_index'] = table_index
|
||||
# table_measure_obj['llm_measure'] = table_llm_measure
|
||||
|
@ -352,5 +354,5 @@ if __name__ == "__main__":
|
|||
# parse_llm_measure_to_db(measure_info)
|
||||
# get_measure_from_llm()
|
||||
end_time = datetime.now()
|
||||
print("结束时间:", end_time.strftime("%Y-%m-%d %H:%M:%S"))
|
||||
logger.info("结束时间:", end_time.strftime("%Y-%m-%d %H:%M:%S"))
|
||||
#print(pdf_data)
|
|
@ -19,6 +19,8 @@ from pymilvus import MilvusClient
|
|||
#import pdf_title
|
||||
import numpy as np
|
||||
#from multiprocessing import Process
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
||||
|
@ -81,9 +83,9 @@ def get_text_content_test(file_path,file_id,pages,tables_range):
|
|||
|
||||
# 记录需要过滤掉的页码
|
||||
if len(re.findall('母公司|现金流量表补充', line_text)) > 0:
|
||||
print('成功识别到了')
|
||||
logger.info('成功识别到了')
|
||||
except Exception as e:
|
||||
print(f"Error processing page {pagenum+1}: {e}")
|
||||
logger.error(f"Error processing page {pagenum+1}: {e}")
|
||||
|
||||
pdf_path = r"combined_v61.pdf"
|
||||
file_id = 1
|
||||
|
|
|
@ -19,6 +19,8 @@ from pymilvus import MilvusClient
|
|||
#import pdf_title
|
||||
import numpy as np
|
||||
#from multiprocessing import Process
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
STR_PATTERN = '营业收入|净利润|变动比例|损益|现金流量净额|现金净流量|现金流|每股收益|总资产|资产总额|收益率|货币资金|应收账款|存货|固定资产|在建工程|商誉|短期借款|应付账款|合同负债|长期借款|营业成本|销售费用|管理费用|财务费用|研发费用|研发投入'
|
||||
#负责表内一旦出现某个字符,整个表丢弃
|
||||
|
@ -202,7 +204,7 @@ tables_range = {}
|
|||
# print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
|
||||
def get_table_range_test(file_path, file_id, pages, tables_range):
|
||||
|
||||
print('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
|
||||
logger.info('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
|
||||
start = time.time()
|
||||
|
||||
# conn = mysql.connector.connect(
|
||||
|
@ -295,7 +297,7 @@ def get_table_range_test(file_path, file_id, pages, tables_range):
|
|||
'table_index' : table_index,
|
||||
'page_num' : page_num,
|
||||
})
|
||||
print(f"tables_range的值是{tables_range}")
|
||||
logger.debug(f"tables_range的值是{tables_range}")
|
||||
|
||||
# db_service.insert_pdf_parse_process({
|
||||
# 'file_id': file_id,
|
||||
|
@ -319,7 +321,7 @@ def get_table_range_test(file_path, file_id, pages, tables_range):
|
|||
# redis_client.close()
|
||||
|
||||
end = time.time()
|
||||
print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
|
||||
logger.info('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
|
||||
|
||||
|
||||
get_table_range_test(file_path, file_id, pages, tables_range)
|
||||
|
|
|
@ -10,6 +10,12 @@ import requests
|
|||
import config
|
||||
import numpy as np
|
||||
from docx2pdf import convert
|
||||
from config import api_key
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
dashscope.api_key = api_key
|
||||
|
||||
|
||||
def get_md5(str):
|
||||
import hashlib
|
||||
|
@ -20,25 +26,27 @@ def get_md5(str):
|
|||
def embed_with_str(input):
|
||||
retry = 0
|
||||
max_retry = 5
|
||||
t = 0.1
|
||||
t = 0.2
|
||||
while retry < max_retry:
|
||||
#阿里接口限流
|
||||
time.sleep(t)
|
||||
# time.sleep(t)
|
||||
#阿里接口限流
|
||||
resp = dashscope.TextEmbedding.call(
|
||||
model=dashscope.TextEmbedding.Models.text_embedding_v2,
|
||||
input=input)
|
||||
if resp.status_code == HTTPStatus.OK:
|
||||
return resp
|
||||
elif resp.status_code == 429:
|
||||
print(f'触发限流,等待{t}秒后重试')
|
||||
logger.info(f'触发限流,等待{t}秒后重试')
|
||||
retry += 1
|
||||
t+=0.1
|
||||
else:
|
||||
print(f'请求失败,状态码:{resp.status_code}')
|
||||
logger.error(f'请求失败,状态码:{resp.status_code}')
|
||||
return None
|
||||
print('重试超过上限')
|
||||
logger.error('重试超过上限')
|
||||
return None
|
||||
|
||||
|
||||
|
||||
#如果存在‘归属于|扣非’,就保留括号内的内容,并去掉标点符号和中文数字。
|
||||
#如果存在季度关键词,就将括号内容替换为季度
|
||||
#如果存在‘±’,就将括号内容替换为同期增减
|
||||
|
@ -89,7 +97,7 @@ def get_clean_text(text):
|
|||
return pattern.sub(lambda match: replacements[match.group(0)], text)
|
||||
text = replace_all(text, replacement_dict)
|
||||
#单独出现12月31日时,就剔除掉
|
||||
pattern_year = r'(?<!2023年|2022年|2021年)12月31日'
|
||||
pattern_year = r'(?<!2025年|2024年|2023年|2022年|2021年)12月31日'
|
||||
text = re.sub(pattern_year, '', text)
|
||||
|
||||
pattern = r"\([^)]*\)|\([^)]*\)" # 增加英文括号的匹配
|
||||
|
@ -137,11 +145,11 @@ def convert_docx_to_pdf(file_path):
|
|||
try:
|
||||
# 执行转换
|
||||
convert(file_path, pdf_path)
|
||||
print(f"转换成功: {pdf_path}")
|
||||
logger.info(f"转换成功: {pdf_path}")
|
||||
except Exception as e:
|
||||
print(f"转换失败: {e}")
|
||||
logger.error(f"转换失败: {e}")
|
||||
else:
|
||||
print("错误: 文件必须是 .docx 格式。")
|
||||
logger.error("错误: 文件必须是 .docx 格式。")
|
||||
|
||||
def save_pdf_from_url(url, file_path):
|
||||
from urllib.parse import unquote
|
||||
|
@ -163,10 +171,10 @@ def save_pdf_from_url(url, file_path):
|
|||
|
||||
with open(local_file_path, 'wb') as file:
|
||||
file.write(response.content)
|
||||
print(f"文件已下载到 {local_file_path}")
|
||||
logger.info(f"文件已下载到 {local_file_path}")
|
||||
else:
|
||||
# 文件下载失败
|
||||
print(f"无法下载文件,状态码:{response.status_code}")
|
||||
logger.error(f"无法下载文件,状态码:{response.status_code}")
|
||||
|
||||
return local_file_path
|
||||
|
||||
|
@ -252,7 +260,7 @@ def get_season_flag(text):
|
|||
return '0'
|
||||
|
||||
def get_percent_flag(text):
|
||||
percent_word = '收益率|占比|比重|比例|同比增减|同比上升|同比下降|变化幅度|同期增减|本年比上年增减|同比变动|变动比例|本年度比上年度增减|增减'
|
||||
percent_word = '收益率|占比|比重|比例|同比增减|同比上升|同比下降|变化幅度|同期增减|本年比上年增减|同比变动|本期期末金额较上期期末变动比例'
|
||||
if len(re.findall(percent_word, text)) > 0:
|
||||
return '1'
|
||||
else:
|
||||
|
@ -293,40 +301,7 @@ def check_black_list(meta_measure, pdf_measure, black_array):
|
|||
|
||||
def check_black_list_old(meta_measure,pdf_measure):
|
||||
# 判断指标名是否包含黑名单词
|
||||
#black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额','营业收入:营业外收入,主营业务,营业总收入,扣除,年度公司','归母净利润:净资产,净利率,扣除,年度公司','扣非净利润:净资产,净利率,年度公司','经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计','投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计','非经常性损益:扣除非经常性损益','基本每股收益:稀释每股收益','稀释每股收益:基本每股收益','总资产:净资产','应收账款:应付账款','短期借款:长期借款','应付账款:应收账款','长期借款:短期借款','研发投入:比例,比率,占比,费用','资本化研发投入:比例,比率,占比,费用','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用','上年年末:1月1日']
|
||||
black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额,合计'
|
||||
,'营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司,合计,汇总'
|
||||
,'归母净利润:净资产,净利率,扣除,年度公司,归属于本公司普通股股东的净利润'
|
||||
,'扣非净利润:净资产,净利率,年度公司'
|
||||
,'经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计,每股,扣除'
|
||||
,'筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计,每股,扣除'
|
||||
,'投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计,每股,扣除'
|
||||
,'非经常性损益:扣除非经常性损益'
|
||||
,'基本每股收益:稀释每股收益,发行新股'
|
||||
,'稀释每股收益:基本每股收益,发行新股'
|
||||
,'总资产:净资产','应收账款:应付账款,年以上,内,至,到'
|
||||
,'短期借款:长期借款,非流动负债,年以上,年以内,内,至,到'
|
||||
,'应付账款:应收账款,年以上,内,至,到'
|
||||
,'长期借款:短期借款,非流动负债,年以上,内,至,到,保证,抵押'
|
||||
,'研发投入:比例,比率,占比,费用,占'
|
||||
,'资本化研发投入:比例,比率,占比,费用,占'
|
||||
,'资本化研发投入占比:金额,费用'
|
||||
,'研发投入占营业收入比例:金额,费用'
|
||||
,'上年年末:1月1日'
|
||||
,'期加权平均净资产收益率:同比,扣除,扣非,年化,每股'
|
||||
,'期扣非加权平均净资产收益率:同比,年化,每股'
|
||||
,'加权平均净资产收益率同比变动:年化,每股'
|
||||
,'研发费用:制造,投入,直接,管理'
|
||||
,'应收账款:1-2年','货币资金:在途'
|
||||
,'当期:2023年1-6月,调整后'
|
||||
,'营业成本:营业总成本'
|
||||
,'长期借债:年内到期','研发投入:直接'
|
||||
,'第一季度:第二季度,第三季度,第四季度'
|
||||
,'第二季度:第一季度,第三季度,第四季度'
|
||||
,'第三季度:第二季度,第一季度,第四季度'
|
||||
,'第四季度:第二季度,第三季度,第一季度'
|
||||
,'研发费用:研发支出,研发投入','存货:跌价准备'
|
||||
,'费用:日常,付现','固定资产:改良,补助,投资']
|
||||
black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额','营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司','归母净利润:净资产,净利率,扣除,年度公司','扣非净利润:净资产,净利率,年度公司','经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计','投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计','非经常性损益:扣除非经常性损益','基本每股收益:稀释每股收益','稀释每股收益:基本每股收益','总资产:净资产','应收账款:应付账款','短期借款:长期借款','应付账款:应收账款','长期借款:短期借款','研发投入:比例,比率,占比,费用','资本化研发投入:比例,比率,占比,费用','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用']
|
||||
# current_period = f'当期:{report_year}年1-6月'
|
||||
# black_array.append(current_period)
|
||||
for black in black_array:
|
||||
|
@ -550,26 +525,26 @@ def check_black_table_list(data):
|
|||
black_meta = black.split(':')[0]
|
||||
black_pdfs = black.split(':')[1].split(',')
|
||||
if any(black_meta in cell for row in data for cell in row):
|
||||
print(data)
|
||||
logger.debug(data)
|
||||
for pdf in black_pdfs:
|
||||
data = [row for row in data if not any(pdf in cell for cell in row)]
|
||||
return data
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
print(len('我是我'))
|
||||
logger.debug(len('我是我'))
|
||||
|
||||
# print(under_non_alpha_ratio('202水电费水电费水电费是的205月'))
|
||||
# logger.debug(under_non_alpha_ratio('202水电费水电费水电费是的205月'))
|
||||
# title = '母公司财务报表主要项目注释'
|
||||
# if len(re.findall('母公司|现金流量表补充', title)) >0 and len(re.findall('项目注释', title)) == 0:
|
||||
# print('1')
|
||||
# logger.debug('1')
|
||||
# else:
|
||||
# print('0')
|
||||
# logger.debug('0')
|
||||
|
||||
# print(check_black_list('当期投资活动现金流净额','当前筹资活动现金流净额'))
|
||||
# logger.debug(check_black_list('当期投资活动现金流净额','当前筹资活动现金流净额'))
|
||||
# test = '2023年1-12月'
|
||||
# print(get_period_type('上年度本期费用化研发投入'))
|
||||
# print(get_period_type('费用化研发投入本年度'))
|
||||
# logger.debug(get_period_type('上年度本期费用化研发投入'))
|
||||
# logger.debug(get_period_type('费用化研发投入本年度'))
|
||||
# vector_a = embed_with_str('第一季度营业收入')
|
||||
# vector = vector_a.output["embeddings"][0]["embedding"]
|
||||
|
||||
|
@ -577,7 +552,7 @@ if __name__ == '__main__':
|
|||
# vector1 = vector_b.output["embeddings"][0]["embedding"]
|
||||
|
||||
# similarity = cosine_similarity(vector, vector1)
|
||||
# print(f"余弦相似度: {similarity}")
|
||||
# logger.debug(f"余弦相似度: {similarity}")
|
||||
|
||||
# measure_data = [
|
||||
# '1,1,营业收入2023年金额,1003535799.51',
|
||||
|
@ -792,21 +767,14 @@ if __name__ == '__main__':
|
|||
# )
|
||||
# vector_obj = embed_with_str('2023年营业收入')
|
||||
# vector = vector_obj.output["embeddings"][0]["embedding"]
|
||||
# data = [vector]
|
||||
# res = client.search(
|
||||
# collection_name="zzb_measure", # Replace with the actual name of your collection
|
||||
# # Replace with your query vector
|
||||
# data=data,
|
||||
# limit=1, # Max. number of search results to return
|
||||
# search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
|
||||
# output_fields=["measure_name","measure_value"]
|
||||
# )
|
||||
|
||||
# # Convert the output to a formatted JSON string
|
||||
# result = json.dumps(res, indent=4, ensure_ascii=False)
|
||||
# print(result)
|
||||
# vector_b = embed_with_str('营业收入第一季度')
|
||||
# vector1 = vector_b.output["embeddings"][0]["embedding"]
|
||||
|
||||
# similarity = cosine_similarity(vector, vector1)
|
||||
# logger.debug(f"余弦相似度: {similarity}")
|
||||
|
||||
# insert_measure_data(client, measure_data)
|
||||
# text = '营业收入第一季度(1-3月份)'
|
||||
# new_text = re.sub(r'([^)]*)', '',text)
|
||||
# print(new_text)
|
||||
# logger.debug(new_text)
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
--2024-12-27 11:22:17-- https://financial-report.obs.cn-east-3.myhuaweicloud.com/upload/file/44b374ac0fe140a2922c360db47335a1.PDF?AccessKeyId=WMBIZTLULUR24OBUIRC4
|
||||
Resolving financial-report.obs.cn-east-3.myhuaweicloud.com (financial-report.obs.cn-east-3.myhuaweicloud.com)... failed: Name or service not known.
|
||||
wget: unable to resolve host address ‘financial-report.obs.cn-east-3.myhuaweicloud.com’
|
|
@ -0,0 +1,268 @@
|
|||
SELECT * from ori_measure_list where file_id=201928 and measure_name like "%营业收入%";
|
||||
|
||||
|
||||
SELECT * from ori_measure_list where file_id=39302 and measure_name like "%经营活动现金流%" order by page_number;
|
||||
SELECT * from ori_measure_list where file_id=201876 and measure_name like "%经营活动现金流%" order by page_number;
|
||||
|
||||
SELECT * from measure_list where file_id=201876;
|
||||
|
||||
SELECT * from word_measure_parse_process where file_id=201834;
|
||||
|
||||
SELECT * from pdf_parse_process where file_id=201844 and content like '%"table"%' ORDER BY page_num;
|
||||
|
||||
SELECT * from measure_parse_process where file_id=201844 and content like '%经营活动%';
|
||||
SELECT * from word_measure_parse_process where file_id=201876 and content like '%经营活动%'
|
||||
|
||||
SELECT * from measure_list where file_id=201844 and measure_name like "%经营活动现金流%";
|
||||
SELECT * from measure_list where file_id=201876 and measure_name like "%经营活动现金流%";
|
||||
##问题1 page——num 11 这个 word 没存
|
||||
|
||||
SELECT * from word_parse_process where file_id=201837 and content like '%"table"%' ORDER BY page_num;
|
||||
|
||||
DELETE from word_parse_process where file_id=201834 ;
|
||||
DELETE from ori_measure_list where file_id=201834 ;
|
||||
|
||||
SELECT * from report_check order by create_time desc ;
|
||||
|
||||
select count(*) from word_measure_parse_process where file_id = 593;
|
||||
|
||||
select count(*) from word_measure_parse_process where file_id = 201834;
|
||||
|
||||
|
||||
SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config_third_quarter
|
||||
where year = 2023
|
||||
|
||||
SELECT * from pdf_text_info where file_id=201844;
|
||||
|
||||
SELECT * FROM measure_black_list where isdel = 0 and find_in_set(3,flag) and find_in_set(2023,year) and measure_name like "%经营活动%";
|
||||
|
||||
SELECT * from measure_parser_info where file_id=201928;
|
||||
|
||||
SELECT * from word_measure_parse_process where file_id=201928;
|
||||
|
||||
|
||||
select * from word_parse_process WHERE file_id = '201928' order by page_num;
|
||||
|
||||
SELECT * from report_check WHERE id = "40925";
|
||||
|
||||
|
||||
select
|
||||
InnerCode
|
||||
,CompanyCode
|
||||
,LPAD(SecuCode,6,'0') as SecuCode
|
||||
,ChiName
|
||||
from secumain s
|
||||
where ListedState=1
|
||||
and ListedSector in (1,2,6,7,8)
|
||||
and SecuCategory in (1,41)
|
||||
and SecuMarket in (18,83,90)
|
||||
|
||||
SELECT * from third_measure_data where SecuCode like '688032' ;
|
||||
|
||||
set @year = "2023";
|
||||
set @enddate = "2023-6-30 00:00:00";
|
||||
set @InfoPublDate = '2024-9-30 00:00:00';
|
||||
-- 年报是0 半年报是1
|
||||
set @report_type = 1;
|
||||
|
||||
SELECT * from lc_stibmaindata WHERE CompanyCode = '19767000' and IfMerged='1' and EndDate=@enddate and InfoPublDate < @InfoPublDate;
|
||||
|
||||
SELECT * from lc_stibincomestate WHERE CompanyCode = '19767000' and IfMerged='1' and EndDate=@enddate and InfoPublDate < @InfoPublDate;
|
||||
|
||||
SELECT * from lc_stibbalancesheet WHERE CompanyCode = '19767000' and IfMerged='1' and EndDate=@enddate and InfoPublDate < @InfoPublDate;
|
||||
|
||||
drop table if exists third_quarter_external_data_tmp_pre;
|
||||
create table third_quarter_external_data_tmp_pre
|
||||
as
|
||||
select distinct @year as year
|
||||
,@report_type as report_type
|
||||
,t1.*
|
||||
,Infopubldate
|
||||
,InfoPublDate_end
|
||||
,OperatingReenue
|
||||
,NPFromParentCompanyOwners
|
||||
,NetProfitCut
|
||||
,NetOperateCashFlow
|
||||
,NetFinanceCashFlow
|
||||
,NetInvestCashFlow
|
||||
,NonRecurringProfitLoss
|
||||
,BasicEPS
|
||||
,DilutedEPS
|
||||
,WROE
|
||||
,WROECut
|
||||
,TotalAssets
|
||||
,CashEquivalents
|
||||
,AccountReceivable
|
||||
,Inventories
|
||||
,TotalFixedAsset
|
||||
,TConstruInProcess
|
||||
,GoodWill
|
||||
,ShortTermLoan
|
||||
,AccountsPayable
|
||||
,ContractLiability
|
||||
,LongtermLoan
|
||||
,AccountingStandards
|
||||
,OperatingExpense
|
||||
,OperatingCost
|
||||
,AdministrationExpense
|
||||
,FinancialExpense
|
||||
,RAndD
|
||||
from
|
||||
(
|
||||
select
|
||||
InnerCode
|
||||
,CompanyCode
|
||||
,LPAD(SecuCode,6,'0') as SecuCode
|
||||
,ChiName
|
||||
from secumain s
|
||||
where ListedState=1
|
||||
and ListedSector in (1,2,6,7,8)
|
||||
and SecuCategory in (1,41)
|
||||
and SecuMarket in (18,83,90)
|
||||
)t1
|
||||
left join
|
||||
(
|
||||
select
|
||||
CompanyCode
|
||||
,Infopubldate
|
||||
,OperatingReenue
|
||||
, null as NPFromParentCompanyOwners
|
||||
,NetProfitCut
|
||||
,NetOperateCashFlow
|
||||
,NetFinanceCashFlow
|
||||
,NetInvestCashFlow
|
||||
,NonRecurringProfitLoss
|
||||
,BasicEPS
|
||||
,DilutedEPS
|
||||
,WROE
|
||||
,WROECut
|
||||
from
|
||||
(
|
||||
SELECT *
|
||||
FROM (SELECT b.*
|
||||
,@rownum := @rownum+1
|
||||
,IF(@pdept = b.CompanyCode, @rank := @rank + 1, @rank := 1) AS rank_num
|
||||
,@pdept := b.CompanyCode
|
||||
FROM (SELECT * FROM lc_stibmaindata a
|
||||
where EndDate=@enddate and InfoPublDate < @InfoPublDate
|
||||
ORDER BY a.CompanyCode, a.InfoPublDate DESC) b
|
||||
-- 初始化自定义变量值
|
||||
,(SELECT @rownum :=0, @pdept := NULL, @rank:=0) c
|
||||
-- 该排序必须,否则结果会不对
|
||||
ORDER BY b.CompanyCode, b.InfoPublDate DESC) result
|
||||
order by CompanyCode, rank_num
|
||||
)t where t.rank_num=1
|
||||
)t2
|
||||
on t1.CompanyCode = t2.CompanyCode
|
||||
left join
|
||||
(
|
||||
select
|
||||
EndDate,
|
||||
InfoSourceCode as InfoSource,
|
||||
CompanyCode,
|
||||
TotalAssets
|
||||
from
|
||||
(
|
||||
SELECT *
|
||||
FROM (SELECT b.*
|
||||
,@rownum := @rownum+1
|
||||
,IF(@pdept = b.CompanyCode, @rank := @rank + 1, @rank := 1) AS rank_num
|
||||
,@pdept := b.CompanyCode
|
||||
FROM (SELECT * FROM lc_stibmaindata a
|
||||
where EndDate=@enddate and InfoPublDate < @InfoPublDate
|
||||
ORDER BY a.CompanyCode, a.InfoPublDate DESC) b
|
||||
-- 初始化自定义变量值
|
||||
,(SELECT @rownum :=0, @pdept := NULL, @rank:=0) c
|
||||
-- 该排序必须,否则结果会不对
|
||||
ORDER BY b.CompanyCode, b.InfoPublDate DESC) result
|
||||
order by CompanyCode, rank_num
|
||||
)t where t.rank_num=1
|
||||
)t5
|
||||
on t1.CompanyCode = t5.CompanyCode
|
||||
left join
|
||||
(
|
||||
select
|
||||
EndDate,
|
||||
InfoPublDate as InfoPublDate_end,
|
||||
InfoSourceCode as InfoSource,
|
||||
CompanyCode,
|
||||
CashEquivalents,
|
||||
AccountReceivable,
|
||||
Inventories,
|
||||
TotalFixedAsset,
|
||||
TConstruInProcess,
|
||||
GoodWill,
|
||||
ShortTermLoan,
|
||||
AccountsPayable,
|
||||
ContractLiability,
|
||||
LongtermLoan
|
||||
from
|
||||
(
|
||||
SELECT *
|
||||
FROM (SELECT b.*
|
||||
,@rownum := @rownum+1
|
||||
,IF(@pdept = b.CompanyCode, @rank := @rank + 1, @rank := 1) AS rank_num
|
||||
,@pdept := b.CompanyCode
|
||||
FROM (SELECT * FROM lc_stibbalancesheet a
|
||||
where IfMerged =1
|
||||
and EndDate=@enddate and InfoPublDate < @InfoPublDate
|
||||
ORDER BY a.CompanyCode, a.InfoPublDate DESC) b
|
||||
-- 初始化自定义变量值
|
||||
,(SELECT @rownum :=0, @pdept := NULL, @rank:=0) c
|
||||
-- 该排序必须,否则结果会不对
|
||||
ORDER BY b.CompanyCode, b.InfoPublDate DESC) result
|
||||
order by CompanyCode, rank_num
|
||||
)t where t.rank_num=1
|
||||
)t4
|
||||
on t1.CompanyCode = t4.CompanyCode
|
||||
left join
|
||||
(
|
||||
select
|
||||
CompanyCode
|
||||
,EndDate
|
||||
,null as AccountingStandards
|
||||
,OperatingExpense
|
||||
,OperatingCost
|
||||
,AdministrationExpense
|
||||
,FinancialExpense
|
||||
,RAndD
|
||||
from
|
||||
(
|
||||
SELECT *
|
||||
FROM (SELECT b.*
|
||||
,@rownum := @rownum+1
|
||||
,IF(@pdept = b.CompanyCode, @rank := @rank + 1, @rank := 1) AS rank_num
|
||||
,@pdept := b.CompanyCode
|
||||
FROM (SELECT * FROM lc_stibincomestate a
|
||||
where IfMerged =1
|
||||
and EndDate=@enddate and InfoPublDate < @InfoPublDate
|
||||
ORDER BY a.CompanyCode, a.InfoPublDate DESC) b
|
||||
-- 初始化自定义变量值
|
||||
,(SELECT @rownum :=0, @pdept := NULL, @rank:=0) c
|
||||
-- 该排序必须,否则结果会不对
|
||||
ORDER BY b.CompanyCode, b.InfoPublDate DESC) result
|
||||
order by CompanyCode, rank_num
|
||||
)t where t.rank_num=1
|
||||
)t3
|
||||
on t1.CompanyCode = t3.CompanyCode
|
||||
;
|
||||
|
||||
SELECT * from third_quarter_external_data_tmp_pre where CompanyCode = '19767000';
|
||||
|
||||
|
||||
SELECT * from third_measure_data where SecuCode like '000937%' and year = '2021' and report_type = '0';
|
||||
|
||||
|
||||
SELECT * from lc_balancesheetall where CompanyCode = '171402' and IfMerged = '1';
|
||||
|
||||
SELECT * from lc_maindatanew where CompanyCode = '503' and EndDate like '%2021-12-31%' ;
|
||||
|
||||
SELECT * from secumain where SecuCode = '300432'
|
||||
|
||||
|
||||
|
||||
delete from third_measure_data where report_type
|
||||
|
||||
SELECT * from model_ip;
|
||||
|
||||
|
|
@ -0,0 +1,63 @@
|
|||
/*
|
||||
Navicat Premium Data Transfer
|
||||
|
||||
Source Server : USM_financial_report@10.127.2.208_8f0a
|
||||
Source Server Type : MySQL
|
||||
Source Server Version : 80013
|
||||
Source Host : 10.124.45.100:63306
|
||||
Source Schema : financial_report_prod
|
||||
|
||||
Target Server Type : MySQL
|
||||
Target Server Version : 80013
|
||||
File Encoding : 65001
|
||||
|
||||
Date: 20/08/2025 22:20:40
|
||||
*/
|
||||
|
||||
SET NAMES utf8mb4;
|
||||
SET FOREIGN_KEY_CHECKS = 0;
|
||||
|
||||
-- ----------------------------
|
||||
-- Table structure for third_measure_data_copy1
|
||||
-- ----------------------------
|
||||
DROP TABLE IF EXISTS `third_measure_data_copy1`;
|
||||
CREATE TABLE `third_measure_data_copy1` (
|
||||
`year` varchar(4) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
|
||||
`report_type` int(1) NOT NULL DEFAULT 0,
|
||||
`InnerCode` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`CompanyCode` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`SecuCode` varchar(6) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`ChiName` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`Infopubldate` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`InfoPublDate_end` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`OperatingReenue` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`NPFromParentCompanyOwners` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`NetProfitCut` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`NetOperateCashFlow` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`NetFinanceCashFlow` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`NetInvestCashFlow` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`NonRecurringProfitLoss` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`BasicEPS` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`DilutedEPS` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`WROE` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`WROECut` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`TotalAssets` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`CashEquivalents` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`AccountReceivable` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`Inventories` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`TotalFixedAsset` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`TConstruInProcess` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`GoodWill` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`ShortTermLoan` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`AccountsPayable` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`ContractLiability` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`LongtermLoan` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`AccountingStandards` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`OperatingExpense` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`OperatingCost` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`AdministrationExpense` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`FinancialExpense` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`RAndD` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL
|
||||
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_general_ci ROW_FORMAT = Dynamic;
|
||||
|
||||
SET FOREIGN_KEY_CHECKS = 1;
|
|
@ -0,0 +1,226 @@
|
|||
set @year = "2024";
|
||||
SET @next_year = CAST(CAST(@year AS UNSIGNED) + 1 AS CHAR);
|
||||
set @enddate = CONCAT(@year,"" ,"-6-30 00:00:00");
|
||||
set @enddate_1 = CONCAT(@year,"" ,"-12-31 00:00:00");
|
||||
set @InfoPublDate = CONCAT(@next_year,"" ,"-9-30 00:00:00");
|
||||
-- 年报是0 半年报是1
|
||||
set @report_type = 1;
|
||||
|
||||
|
||||
drop table if exists third_quarter_external_data_tmp_pre;
|
||||
|
||||
|
||||
create table third_quarter_external_data_tmp_pre
|
||||
as
|
||||
select distinct @year as year
|
||||
,@report_type as report_type
|
||||
,t1.*
|
||||
,Infopubldate
|
||||
,InfoPublDate_end
|
||||
,OperatingReenue
|
||||
,NPFromParentCompanyOwners
|
||||
,NetProfitCut
|
||||
,NetOperateCashFlow
|
||||
,NetFinanceCashFlow
|
||||
,NetInvestCashFlow
|
||||
,NonRecurringProfitLoss
|
||||
,BasicEPS
|
||||
,DilutedEPS
|
||||
,WROE
|
||||
,WROECut
|
||||
,TotalAssets
|
||||
,CashEquivalents
|
||||
,AccountReceivable
|
||||
,Inventories
|
||||
,TotalFixedAsset
|
||||
,TConstruInProcess
|
||||
,GoodWill
|
||||
,ShortTermLoan
|
||||
,AccountsPayable
|
||||
,ContractLiability
|
||||
,LongtermLoan
|
||||
,AccountingStandards
|
||||
,OperatingExpense
|
||||
,OperatingCost
|
||||
,AdministrationExpense
|
||||
,FinancialExpense
|
||||
,RAndD
|
||||
from
|
||||
(
|
||||
select
|
||||
InnerCode
|
||||
,CompanyCode
|
||||
,LPAD(SecuCode,6,'0') as SecuCode
|
||||
,ChiName
|
||||
from secumain s
|
||||
where ListedState=1
|
||||
and ListedSector in (1,2,6,7,8)
|
||||
and SecuCategory in (1,41)
|
||||
and SecuMarket in (18,83,90)
|
||||
)t1
|
||||
left join
|
||||
(
|
||||
select
|
||||
CompanyCode
|
||||
,Infopubldate
|
||||
,OperatingReenue
|
||||
,NPFromParentCompanyOwners
|
||||
,NetProfitCut
|
||||
,NetOperateCashFlow
|
||||
,NetFinanceCashFlow
|
||||
,NetInvestCashFlow
|
||||
,NonRecurringProfitLoss
|
||||
,BasicEPS
|
||||
,DilutedEPS
|
||||
,WROE
|
||||
,WROECut
|
||||
from
|
||||
(
|
||||
SELECT *
|
||||
FROM (SELECT b.*
|
||||
,@rownum := @rownum+1
|
||||
,IF(@pdept = b.CompanyCode, @rank := @rank + 1, @rank := 1) AS rank_num
|
||||
,@pdept := b.CompanyCode
|
||||
FROM (SELECT * FROM lc_maindatanew a
|
||||
where EndDate=@enddate and InfoPublDate < @InfoPublDate
|
||||
and mark NOT IN (4, 5)
|
||||
ORDER BY a.CompanyCode, a.InfoPublDate DESC) b
|
||||
-- 初始化自定义变量值
|
||||
,(SELECT @rownum :=0, @pdept := NULL, @rank:=0) c
|
||||
-- 该排序必须,否则结果会不对
|
||||
ORDER BY b.CompanyCode, b.InfoPublDate DESC) result
|
||||
order by CompanyCode, rank_num
|
||||
)t where t.rank_num=1
|
||||
)t2
|
||||
on t1.CompanyCode = t2.CompanyCode
|
||||
left join
|
||||
(
|
||||
select
|
||||
EndDate,
|
||||
InfoSource,
|
||||
CompanyCode
|
||||
from
|
||||
(
|
||||
SELECT *
|
||||
FROM (SELECT b.*
|
||||
,@rownum := @rownum+1
|
||||
,IF(@pdept = b.CompanyCode, @rank := @rank + 1, @rank := 1) AS rank_num
|
||||
,@pdept := b.CompanyCode
|
||||
FROM (SELECT * FROM lc_maindatanew a
|
||||
where EndDate=@enddate and InfoPublDate < @InfoPublDate
|
||||
and mark NOT IN (4, 5)
|
||||
ORDER BY a.CompanyCode, a.InfoPublDate DESC) b
|
||||
-- 初始化自定义变量值
|
||||
,(SELECT @rownum :=0, @pdept := NULL, @rank:=0) c
|
||||
-- 该排序必须,否则结果会不对
|
||||
ORDER BY b.CompanyCode, b.InfoPublDate DESC) result
|
||||
order by CompanyCode, rank_num
|
||||
)t where t.rank_num=1
|
||||
)t5
|
||||
on t1.CompanyCode = t5.CompanyCode
|
||||
left join
|
||||
(
|
||||
select
|
||||
CompanyCode,
|
||||
TotalAssets
|
||||
from
|
||||
(
|
||||
SELECT *
|
||||
FROM (SELECT b.*
|
||||
,@rownum := @rownum+1
|
||||
,IF(@pdept = b.CompanyCode, @rank := @rank + 1, @rank := 1) AS rank_num
|
||||
,@pdept := b.CompanyCode
|
||||
FROM (SELECT * FROM lc_maindatanew a
|
||||
where EndDate=@enddate_1 and InfoPublDate < @InfoPublDate
|
||||
and mark NOT IN (4, 5)
|
||||
ORDER BY a.CompanyCode, a.InfoPublDate DESC) b
|
||||
-- 初始化自定义变量值
|
||||
,(SELECT @rownum :=0, @pdept := NULL, @rank:=0) c
|
||||
-- 该排序必须,否则结果会不对
|
||||
ORDER BY b.CompanyCode, b.InfoPublDate DESC) result
|
||||
order by CompanyCode, rank_num
|
||||
)t where t.rank_num=1
|
||||
)t6
|
||||
on t1.CompanyCode = t6.CompanyCode
|
||||
left join
|
||||
(
|
||||
select
|
||||
EndDate,
|
||||
InfoPublDate as InfoPublDate_end,
|
||||
InfoSource,
|
||||
CompanyCode,
|
||||
CashEquivalents,
|
||||
AccountReceivable,
|
||||
Inventories,
|
||||
TotalFixedAsset,
|
||||
TConstruInProcess,
|
||||
GoodWill,
|
||||
ShortTermLoan,
|
||||
AccountsPayable,
|
||||
ContractLiability,
|
||||
LongtermLoan
|
||||
from
|
||||
(
|
||||
SELECT *
|
||||
FROM (SELECT b.*
|
||||
,@rownum := @rownum+1
|
||||
,IF(@pdept = b.CompanyCode, @rank := @rank + 1, @rank := 1) AS rank_num
|
||||
,@pdept := b.CompanyCode
|
||||
FROM (SELECT * FROM lc_balancesheetall a
|
||||
where IfMerged =1
|
||||
and EndDate=@enddate_1 and InfoPublDate < @InfoPublDate
|
||||
ORDER BY a.CompanyCode, a.InfoPublDate DESC) b
|
||||
-- 初始化自定义变量值
|
||||
,(SELECT @rownum :=0, @pdept := NULL, @rank:=0) c
|
||||
-- 该排序必须,否则结果会不对
|
||||
ORDER BY b.CompanyCode, b.InfoPublDate DESC) result
|
||||
order by CompanyCode, rank_num
|
||||
)t where t.rank_num=1
|
||||
)t4
|
||||
on t1.CompanyCode = t4.CompanyCode
|
||||
left join
|
||||
(
|
||||
select
|
||||
CompanyCode
|
||||
,EndDate
|
||||
,AccountingStandards
|
||||
,OperatingExpense
|
||||
,OperatingCost
|
||||
,AdministrationExpense
|
||||
,FinancialExpense
|
||||
,RAndD
|
||||
from
|
||||
(
|
||||
SELECT *
|
||||
FROM (SELECT b.*
|
||||
,@rownum := @rownum+1
|
||||
,IF(@pdept = b.CompanyCode, @rank := @rank + 1, @rank := 1) AS rank_num
|
||||
,@pdept := b.CompanyCode
|
||||
FROM (SELECT * FROM LC_IncomeStatementAll a
|
||||
where IfMerged =1
|
||||
and EndDate=@enddate and InfoPublDate < @InfoPublDate
|
||||
ORDER BY a.CompanyCode, a.InfoPublDate DESC) b
|
||||
-- 初始化自定义变量值
|
||||
,(SELECT @rownum :=0, @pdept := NULL, @rank:=0) c
|
||||
-- 该排序必须,否则结果会不对
|
||||
ORDER BY b.CompanyCode, b.InfoPublDate DESC) result
|
||||
order by CompanyCode, rank_num
|
||||
)t where t.rank_num=1
|
||||
)t3
|
||||
on t1.CompanyCode = t3.CompanyCode
|
||||
;
|
||||
|
||||
|
||||
|
||||
INSERT INTO third_measure_data SELECT * from third_quarter_external_data_tmp_pre;
|
||||
|
||||
select * from third_measure_data where year = 2024 and report_type = 1;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,271 @@
|
|||
set @year = "2024";
|
||||
SET @next_year = CAST(CAST(@year AS UNSIGNED) + 1 AS CHAR);
|
||||
set @enddate = CONCAT(@year,"" ,"-6-30 00:00:00");
|
||||
set @enddate_1 = CONCAT(@year,"" ,"-12-31 00:00:00");
|
||||
set @InfoPublDate = CONCAT(@next_year,"" ,"-9-30 00:00:00");
|
||||
-- 年报是0 半年报是1
|
||||
set @report_type = 1;
|
||||
|
||||
-- set @year = "2024";
|
||||
-- SET @next_year = CAST(CAST(@year AS UNSIGNED) + 2 AS CHAR);
|
||||
-- set @enddate = CONCAT(@year,"" ,"-12-31 00:00:00");
|
||||
-- set @enddate_1 = CONCAT(@year,"" ,"-12-31 00:00:00");
|
||||
-- set @InfoPublDate = CONCAT(@next_year,"" ,"-4-30 00:00:00");
|
||||
-- -- 年报是0 半年报是1
|
||||
-- set @report_type = 0;
|
||||
|
||||
|
||||
drop table if exists third_quarter_external_data_tmp_pre;
|
||||
|
||||
|
||||
-- 创建表结构
|
||||
CREATE TABLE `third_quarter_external_data_tmp_pre` (
|
||||
`year` varchar(4) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
|
||||
`report_type` int(1) NOT NULL DEFAULT 0,
|
||||
`InnerCode` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`CompanyCode` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`SecuCode` varchar(6) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`ChiName` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`Infopubldate` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`InfoPublDate_end` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`OperatingReenue` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`NPFromParentCompanyOwners` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`NetProfitCut` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`NetOperateCashFlow` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`NetFinanceCashFlow` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`NetInvestCashFlow` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`NonRecurringProfitLoss` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`BasicEPS` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`DilutedEPS` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`WROE` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`WROECut` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`TotalAssets` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`CashEquivalents` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`AccountReceivable` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`Inventories` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`TotalFixedAsset` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`TConstruInProcess` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`GoodWill` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`ShortTermLoan` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`AccountsPayable` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`ContractLiability` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`LongtermLoan` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`AccountingStandards` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`OperatingExpense` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`OperatingCost` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`AdministrationExpense` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`FinancialExpense` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`RAndD` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL
|
||||
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_general_ci ROW_FORMAT = Dynamic;
|
||||
|
||||
|
||||
-- 插入数据
|
||||
insert into third_quarter_external_data_tmp_pre
|
||||
select distinct @year as year
|
||||
,@report_type as report_type
|
||||
,t1.*
|
||||
,Infopubldate
|
||||
,InfoPublDate_end
|
||||
,OperatingReenue
|
||||
,NPFromParentCompanyOwners
|
||||
,NetProfitCut
|
||||
,NetOperateCashFlow
|
||||
,NetFinanceCashFlow
|
||||
,NetInvestCashFlow
|
||||
,NonRecurringProfitLoss
|
||||
,BasicEPS
|
||||
,DilutedEPS
|
||||
,WROE
|
||||
,WROECut
|
||||
,TotalAssets
|
||||
,CashEquivalents
|
||||
,AccountReceivable
|
||||
,Inventories
|
||||
,TotalFixedAsset
|
||||
,TConstruInProcess
|
||||
,GoodWill
|
||||
,ShortTermLoan
|
||||
,AccountsPayable
|
||||
,ContractLiability
|
||||
,LongtermLoan
|
||||
,AccountingStandards
|
||||
,OperatingExpense
|
||||
,OperatingCost
|
||||
,AdministrationExpense
|
||||
,FinancialExpense
|
||||
,RAndD
|
||||
from
|
||||
(
|
||||
select
|
||||
InnerCode
|
||||
,CompanyCode
|
||||
,LPAD(SecuCode,6,'0') as SecuCode
|
||||
,ChiName
|
||||
from secumain s
|
||||
where ListedState=1
|
||||
and ListedSector in (1,2,6,7,8)
|
||||
and SecuCategory in (1,41)
|
||||
and SecuMarket in (18,83,90)
|
||||
)t1
|
||||
left join
|
||||
(
|
||||
select
|
||||
CompanyCode
|
||||
,Infopubldate
|
||||
,OperatingReenue
|
||||
,NPParComOwners as NPFromParentCompanyOwners
|
||||
,NetProfitCut
|
||||
,NetOperateCashFlow
|
||||
,NetFinanceCashFlow
|
||||
,NetInvestCashFlow
|
||||
,NonRecurringProfitLoss
|
||||
,BasicEPS
|
||||
,DilutedEPS
|
||||
,WROE
|
||||
,WROECut
|
||||
from
|
||||
(
|
||||
SELECT *
|
||||
FROM (SELECT b.*
|
||||
,@rownum := @rownum+1
|
||||
,IF(@pdept = b.CompanyCode, @rank := @rank + 1, @rank := 1) AS rank_num
|
||||
,@pdept := b.CompanyCode
|
||||
FROM (SELECT * FROM lc_stibmaindata a
|
||||
where EndDate=@enddate and InfoPublDate < @InfoPublDate
|
||||
ORDER BY a.CompanyCode, a.InfoPublDate DESC) b
|
||||
-- 初始化自定义变量值
|
||||
,(SELECT @rownum :=0, @pdept := NULL, @rank:=0) c
|
||||
-- 该排序必须,否则结果会不对
|
||||
ORDER BY b.CompanyCode, b.InfoPublDate DESC) result
|
||||
order by CompanyCode, rank_num
|
||||
)t where t.rank_num=2
|
||||
)t2
|
||||
on t1.CompanyCode = t2.CompanyCode
|
||||
left join
|
||||
(
|
||||
select
|
||||
EndDate,
|
||||
InfoSourceCode as InfoSource,
|
||||
CompanyCode
|
||||
from
|
||||
(
|
||||
SELECT *
|
||||
FROM (SELECT b.*
|
||||
,@rownum := @rownum+1
|
||||
,IF(@pdept = b.CompanyCode, @rank := @rank + 1, @rank := 1) AS rank_num
|
||||
,@pdept := b.CompanyCode
|
||||
FROM (SELECT * FROM lc_stibmaindata a
|
||||
where EndDate=@enddate and InfoPublDate < @InfoPublDate
|
||||
ORDER BY a.CompanyCode, a.InfoPublDate DESC) b
|
||||
-- 初始化自定义变量值
|
||||
,(SELECT @rownum :=0, @pdept := NULL, @rank:=0) c
|
||||
-- 该排序必须,否则结果会不对
|
||||
ORDER BY b.CompanyCode, b.InfoPublDate DESC) result
|
||||
order by CompanyCode, rank_num
|
||||
)t where t.rank_num=2
|
||||
)t5
|
||||
on t1.CompanyCode = t5.CompanyCode
|
||||
left join
|
||||
(
|
||||
select
|
||||
CompanyCode,
|
||||
TotalAssets
|
||||
from
|
||||
(
|
||||
SELECT *
|
||||
FROM (SELECT b.*
|
||||
,@rownum := @rownum+1
|
||||
,IF(@pdept = b.CompanyCode, @rank := @rank + 1, @rank := 1) AS rank_num
|
||||
,@pdept := b.CompanyCode
|
||||
FROM (SELECT * FROM lc_stibmaindata a
|
||||
where EndDate=@enddate_1 and InfoPublDate < @InfoPublDate
|
||||
ORDER BY a.CompanyCode, a.InfoPublDate DESC) b
|
||||
-- 初始化自定义变量值
|
||||
,(SELECT @rownum :=0, @pdept := NULL, @rank:=0) c
|
||||
-- 该排序必须,否则结果会不对
|
||||
ORDER BY b.CompanyCode, b.InfoPublDate DESC) result
|
||||
order by CompanyCode, rank_num
|
||||
)t where t.rank_num=2
|
||||
)t6
|
||||
on t1.CompanyCode = t6.CompanyCode
|
||||
left join
|
||||
(
|
||||
select
|
||||
EndDate,
|
||||
InfoPublDate as InfoPublDate_end,
|
||||
InfoSourceCode as InfoSource,
|
||||
CompanyCode,
|
||||
CashEquivalents,
|
||||
AccountReceivable,
|
||||
Inventories,
|
||||
TotalFixedAsset,
|
||||
TConstruInProcess,
|
||||
GoodWill,
|
||||
ShortTermLoan,
|
||||
AccountsPayable,
|
||||
ContractLiability,
|
||||
LongtermLoan
|
||||
from
|
||||
(
|
||||
SELECT *
|
||||
FROM (SELECT b.*
|
||||
,@rownum := @rownum+1
|
||||
,IF(@pdept = b.CompanyCode, @rank := @rank + 1, @rank := 1) AS rank_num
|
||||
,@pdept := b.CompanyCode
|
||||
FROM (SELECT * FROM lc_stibbalancesheet a
|
||||
where IfMerged =1
|
||||
and EndDate=@enddate_1 and InfoPublDate < @InfoPublDate
|
||||
ORDER BY a.CompanyCode, a.InfoPublDate DESC) b
|
||||
-- 初始化自定义变量值
|
||||
,(SELECT @rownum :=0, @pdept := NULL, @rank:=0) c
|
||||
-- 该排序必须,否则结果会不对
|
||||
ORDER BY b.CompanyCode, b.InfoPublDate DESC) result
|
||||
order by CompanyCode, rank_num
|
||||
)t where t.rank_num=2
|
||||
)t4
|
||||
on t1.CompanyCode = t4.CompanyCode
|
||||
left join
|
||||
(
|
||||
select
|
||||
CompanyCode
|
||||
,EndDate
|
||||
,null as AccountingStandards
|
||||
,OperatingExpense
|
||||
,OperatingCost
|
||||
,AdministrationExpense
|
||||
,FinancialExpense
|
||||
,RAndD
|
||||
from
|
||||
(
|
||||
SELECT *
|
||||
FROM (SELECT b.*
|
||||
,@rownum := @rownum+1
|
||||
,IF(@pdept = b.CompanyCode, @rank := @rank + 1, @rank := 1) AS rank_num
|
||||
,@pdept := b.CompanyCode
|
||||
FROM (SELECT * FROM lc_stibincomestate a
|
||||
where IfMerged =1
|
||||
and EndDate=@enddate and InfoPublDate < @InfoPublDate
|
||||
ORDER BY a.CompanyCode, a.InfoPublDate DESC) b
|
||||
-- 初始化自定义变量值
|
||||
,(SELECT @rownum :=0, @pdept := NULL, @rank:=0) c
|
||||
-- 该排序必须,否则结果会不对
|
||||
ORDER BY b.CompanyCode, b.InfoPublDate DESC) result
|
||||
order by CompanyCode, rank_num
|
||||
)t where t.rank_num=2
|
||||
)t3
|
||||
on t1.CompanyCode = t3.CompanyCode
|
||||
;
|
||||
|
||||
|
||||
|
||||
INSERT INTO third_measure_data SELECT * from third_quarter_external_data_tmp_pre;
|
||||
|
||||
|
||||
|
||||
SELECT * from third_measure_data where year = 2024 and report_type = 1;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,266 @@
|
|||
set @year = "2024";
|
||||
SET @next_year = CAST(CAST(@year AS UNSIGNED) + 1 AS CHAR);
|
||||
set @enddate = CONCAT(@year,"" ,"-6-30 00:00:00");
|
||||
set @enddate_1 = CONCAT(@year,"" ,"-12-31 00:00:00");
|
||||
set @InfoPublDate = CONCAT(@next_year,"" ,"-9-30 00:00:00");
|
||||
-- 年报是0 半年报是1
|
||||
set @report_type = 1;
|
||||
|
||||
|
||||
drop table if exists third_quarter_external_data_tmp_pre;
|
||||
|
||||
|
||||
-- 创建表结构
|
||||
CREATE TABLE `third_quarter_external_data_tmp_pre` (
|
||||
`year` varchar(4) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT '',
|
||||
`report_type` int(1) NOT NULL DEFAULT 0,
|
||||
`InnerCode` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`CompanyCode` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`SecuCode` varchar(6) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`ChiName` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`Infopubldate` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`InfoPublDate_end` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`OperatingReenue` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`NPFromParentCompanyOwners` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`NetProfitCut` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`NetOperateCashFlow` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`NetFinanceCashFlow` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`NetInvestCashFlow` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`NonRecurringProfitLoss` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`BasicEPS` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`DilutedEPS` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`WROE` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`WROECut` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`TotalAssets` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`CashEquivalents` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`AccountReceivable` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`Inventories` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`TotalFixedAsset` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`TConstruInProcess` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`GoodWill` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`ShortTermLoan` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`AccountsPayable` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`ContractLiability` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`LongtermLoan` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`AccountingStandards` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`OperatingExpense` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`OperatingCost` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`AdministrationExpense` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`FinancialExpense` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
|
||||
`RAndD` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL
|
||||
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_general_ci ROW_FORMAT = Dynamic;
|
||||
|
||||
-- 插入数据
|
||||
insert into third_quarter_external_data_tmp_pre
|
||||
select distinct @year as year
|
||||
,@report_type as report_type
|
||||
,t1.*
|
||||
,Infopubldate
|
||||
,InfoPublDate_end
|
||||
,OperatingReenue
|
||||
,NPFromParentCompanyOwners
|
||||
,NetProfitCut
|
||||
,NetOperateCashFlow
|
||||
,NetFinanceCashFlow
|
||||
,NetInvestCashFlow
|
||||
,NonRecurringProfitLoss
|
||||
,BasicEPS
|
||||
,DilutedEPS
|
||||
,WROE
|
||||
,WROECut
|
||||
,TotalAssets
|
||||
,CashEquivalents
|
||||
,AccountReceivable
|
||||
,Inventories
|
||||
,TotalFixedAsset
|
||||
,TConstruInProcess
|
||||
,GoodWill
|
||||
,ShortTermLoan
|
||||
,AccountsPayable
|
||||
,ContractLiability
|
||||
,LongtermLoan
|
||||
,AccountingStandards
|
||||
,OperatingExpense
|
||||
,OperatingCost
|
||||
,AdministrationExpense
|
||||
,FinancialExpense
|
||||
,RAndD
|
||||
from
|
||||
(
|
||||
select
|
||||
InnerCode
|
||||
,CompanyCode
|
||||
,LPAD(SecuCode,6,'0') as SecuCode
|
||||
,ChiName
|
||||
from secumain s
|
||||
where ListedState=1
|
||||
and ListedSector in (1,2,6,7,8)
|
||||
and SecuCategory in (1,41)
|
||||
and SecuMarket in (18,83,90)
|
||||
)t1
|
||||
left join
|
||||
(
|
||||
select
|
||||
CompanyCode
|
||||
,Infopubldate
|
||||
,OperatingReenue
|
||||
,NPFromParentCompanyOwners
|
||||
,NetProfitCut
|
||||
,NetOperateCashFlow
|
||||
,NetFinanceCashFlow
|
||||
,NetInvestCashFlow
|
||||
,NonRecurringProfitLoss
|
||||
,BasicEPS
|
||||
,DilutedEPS
|
||||
,WROE
|
||||
,WROECut
|
||||
from
|
||||
(
|
||||
SELECT *
|
||||
FROM (SELECT b.*
|
||||
,@rownum := @rownum+1
|
||||
,IF(@pdept = b.CompanyCode, @rank := @rank + 1, @rank := 1) AS rank_num
|
||||
,@pdept := b.CompanyCode
|
||||
FROM (SELECT * FROM lc_maindatanew a
|
||||
where EndDate=@enddate and InfoPublDate < @InfoPublDate
|
||||
and mark NOT IN (4, 5)
|
||||
ORDER BY a.CompanyCode, a.InfoPublDate DESC) b
|
||||
-- 初始化自定义变量值
|
||||
,(SELECT @rownum :=0, @pdept := NULL, @rank:=0) c
|
||||
-- 该排序必须,否则结果会不对
|
||||
ORDER BY b.CompanyCode, b.InfoPublDate DESC) result
|
||||
order by CompanyCode, rank_num
|
||||
)t where t.rank_num=1
|
||||
)t2
|
||||
on t1.CompanyCode = t2.CompanyCode
|
||||
left join
|
||||
(
|
||||
select
|
||||
EndDate,
|
||||
InfoSource,
|
||||
CompanyCode
|
||||
from
|
||||
(
|
||||
SELECT *
|
||||
FROM (SELECT b.*
|
||||
,@rownum := @rownum+1
|
||||
,IF(@pdept = b.CompanyCode, @rank := @rank + 1, @rank := 1) AS rank_num
|
||||
,@pdept := b.CompanyCode
|
||||
FROM (SELECT * FROM lc_maindatanew a
|
||||
where EndDate=@enddate and InfoPublDate < @InfoPublDate
|
||||
and mark NOT IN (4, 5)
|
||||
ORDER BY a.CompanyCode, a.InfoPublDate DESC) b
|
||||
-- 初始化自定义变量值
|
||||
,(SELECT @rownum :=0, @pdept := NULL, @rank:=0) c
|
||||
-- 该排序必须,否则结果会不对
|
||||
ORDER BY b.CompanyCode, b.InfoPublDate DESC) result
|
||||
order by CompanyCode, rank_num
|
||||
)t where t.rank_num=1
|
||||
)t5
|
||||
on t1.CompanyCode = t5.CompanyCode
|
||||
left join
|
||||
(
|
||||
select
|
||||
CompanyCode,
|
||||
TotalAssets
|
||||
from
|
||||
(
|
||||
SELECT *
|
||||
FROM (SELECT b.*
|
||||
,@rownum := @rownum+1
|
||||
,IF(@pdept = b.CompanyCode, @rank := @rank + 1, @rank := 1) AS rank_num
|
||||
,@pdept := b.CompanyCode
|
||||
FROM (SELECT * FROM lc_maindatanew a
|
||||
where EndDate=@enddate_1 and InfoPublDate < @InfoPublDate
|
||||
and mark NOT IN (4, 5)
|
||||
ORDER BY a.CompanyCode, a.InfoPublDate DESC) b
|
||||
-- 初始化自定义变量值
|
||||
,(SELECT @rownum :=0, @pdept := NULL, @rank:=0) c
|
||||
-- 该排序必须,否则结果会不对
|
||||
ORDER BY b.CompanyCode, b.InfoPublDate DESC) result
|
||||
order by CompanyCode, rank_num
|
||||
)t where t.rank_num=1
|
||||
)t6
|
||||
on t1.CompanyCode = t6.CompanyCode
|
||||
left join
|
||||
(
|
||||
select
|
||||
EndDate,
|
||||
InfoPublDate as InfoPublDate_end,
|
||||
InfoSource,
|
||||
CompanyCode,
|
||||
CashEquivalents,
|
||||
AccountReceivable,
|
||||
Inventories,
|
||||
TotalFixedAsset,
|
||||
TConstruInProcess,
|
||||
GoodWill,
|
||||
ShortTermLoan,
|
||||
AccountsPayable,
|
||||
ContractLiability,
|
||||
LongtermLoan
|
||||
from
|
||||
(
|
||||
SELECT *
|
||||
FROM (SELECT b.*
|
||||
,@rownum := @rownum+1
|
||||
,IF(@pdept = b.CompanyCode, @rank := @rank + 1, @rank := 1) AS rank_num
|
||||
,@pdept := b.CompanyCode
|
||||
FROM (SELECT * FROM lc_balancesheetall a
|
||||
where IfMerged =1
|
||||
and EndDate=@enddate_1 and InfoPublDate < @InfoPublDate
|
||||
ORDER BY a.CompanyCode, a.InfoPublDate DESC) b
|
||||
-- 初始化自定义变量值
|
||||
,(SELECT @rownum :=0, @pdept := NULL, @rank:=0) c
|
||||
-- 该排序必须,否则结果会不对
|
||||
ORDER BY b.CompanyCode, b.InfoPublDate DESC) result
|
||||
order by CompanyCode, rank_num
|
||||
)t where t.rank_num=1
|
||||
)t4
|
||||
on t1.CompanyCode = t4.CompanyCode
|
||||
left join
|
||||
(
|
||||
select
|
||||
CompanyCode
|
||||
,EndDate
|
||||
,AccountingStandards
|
||||
,OperatingExpense
|
||||
,OperatingCost
|
||||
,AdministrationExpense
|
||||
,FinancialExpense
|
||||
,RAndD
|
||||
from
|
||||
(
|
||||
SELECT *
|
||||
FROM (SELECT b.*
|
||||
,@rownum := @rownum+1
|
||||
,IF(@pdept = b.CompanyCode, @rank := @rank + 1, @rank := 1) AS rank_num
|
||||
,@pdept := b.CompanyCode
|
||||
FROM (SELECT * FROM LC_IncomeStatementAll a
|
||||
where IfMerged =1
|
||||
and EndDate=@enddate and InfoPublDate < @InfoPublDate
|
||||
ORDER BY a.CompanyCode, a.InfoPublDate DESC) b
|
||||
-- 初始化自定义变量值
|
||||
,(SELECT @rownum :=0, @pdept := NULL, @rank:=0) c
|
||||
-- 该排序必须,否则结果会不对
|
||||
ORDER BY b.CompanyCode, b.InfoPublDate DESC) result
|
||||
order by CompanyCode, rank_num
|
||||
)t where t.rank_num=1
|
||||
)t3
|
||||
on t1.CompanyCode = t3.CompanyCode
|
||||
;
|
||||
|
||||
|
||||
|
||||
INSERT INTO third_measure_data SELECT * from third_quarter_external_data_tmp_pre;
|
||||
|
||||
select * from third_measure_data where year = 2024 and report_type = 1;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
# 新增指标数据
|
||||
|
||||
delete FROM measure_config_half_year where year = 2025 ;
|
||||
# 1 先复制上一年的所有指标数据
|
||||
|
||||
INSERT INTO measure_config_half_year
|
||||
SELECT
|
||||
-- 列出所有其他字段保持不变
|
||||
measure_id,
|
||||
measure_name,
|
||||
ori_measure_id,
|
||||
ori_measure_name,
|
||||
delete_status,
|
||||
measure_vector,
|
||||
distance,
|
||||
-- 仅修改year字段为2025
|
||||
2025 AS year
|
||||
FROM measure_config_half_year
|
||||
WHERE year = 2024;
|
||||
|
||||
|
||||
# 先修改 ori_measure_name 中上一年2024的 改为2025
|
||||
UPDATE measure_config_half_year
|
||||
SET ori_measure_name = REPLACE(ori_measure_name, '2024', '2025')
|
||||
WHERE ori_measure_name LIKE '%2024%' and `year` = 2025;
|
||||
|
||||
|
||||
# 先修改 ori_measure_name 中上一年2023的 改为2024
|
||||
UPDATE measure_config_half_year
|
||||
SET ori_measure_name = REPLACE(ori_measure_name, '2023', '2024')
|
||||
WHERE ori_measure_name LIKE '%2023%' and `year` = 2025;
|
||||
|
||||
SELECT * from measure_config_half_year where `year` = 2025 and ori_measure_name LIKE '%2025%';
|
|
@ -0,0 +1,14 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.8" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="PyDocumentationSettings">
|
||||
<option name="format" value="PLAIN" />
|
||||
<option name="myDocStringFormat" value="Plain" />
|
||||
</component>
|
||||
</module>
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -27,43 +27,4 @@ def create_partition_by_hour(current_hour):
|
|||
pre_partition = collection.partition(name)
|
||||
pre_partition.release()
|
||||
collection.drop_partition(name)
|
||||
print(f"Partition '{name}' deleted.")
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
from pymilvus import connections, CollectionSchema, Collection,utility,FieldSchema,DataType
|
||||
# 连接到 B 服务器上的 Milvus
|
||||
# connections.connect(host='124.70.129.232', port='19530')# 测试服务器
|
||||
connections.connect(host='127.0.0.1', port='19530')# 测试服务器
|
||||
# # 获取集合列表
|
||||
utility.drop_collection("pdf_measure_v4")
|
||||
|
||||
# 定义字段
|
||||
fields = [
|
||||
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
|
||||
FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=1536),
|
||||
FieldSchema(name="table_num", dtype=DataType.INT16),
|
||||
FieldSchema(name="table_index", dtype=DataType.INT16),
|
||||
FieldSchema(name="measure_name", dtype=DataType.VARCHAR, max_length=200),
|
||||
FieldSchema(name="measure_value", dtype=DataType.VARCHAR, max_length=200),
|
||||
FieldSchema(name="file_id", dtype=DataType.VARCHAR, max_length=200),
|
||||
FieldSchema(name="measure_unit", dtype=DataType.VARCHAR, max_length=200)
|
||||
]
|
||||
|
||||
# 定义集合的 schema
|
||||
schema = CollectionSchema(fields=fields, description="My Milvus collection")
|
||||
|
||||
# 创建集合
|
||||
collection = Collection(name="pdf_measure_v4", schema=schema)
|
||||
|
||||
collection = Collection("pdf_measure_v4")
|
||||
index_params = {
|
||||
"index_type": "IVF_FLAT",
|
||||
"metric_type": "COSINE",
|
||||
"params": {"nlist": 128}
|
||||
}
|
||||
collection.create_index(field_name="vector", index_params=index_params)
|
||||
collection.load()
|
||||
print(f"Partition '{name}' deleted.")
|
|
@ -0,0 +1,5 @@
|
|||
nohup: ignoring input
|
||||
INFO: Started server process [1654611]
|
||||
INFO: Waiting for application startup.
|
||||
INFO: Application startup complete.
|
||||
INFO: Uvicorn running on http://0.0.0.0:8001 (Press CTRL+C to quit)
|
|
@ -353,6 +353,18 @@ app.post("/parser/disclosure",
|
|||
|
||||
# 运行 FastAPI 应用
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
# 获取内网IP
|
||||
ip = get_local_ip()
|
||||
logger.info(f"内网IP地址: {ip}")
|
||||
# 假设 config.NOTIFY_ADDR 是一个字符串,我们可以使用 rpartition 方法来替换最后一个 / 后面的值
|
||||
url = config.NOTIFY_ADDR.rpartition('/')[0] + '/restart?address'
|
||||
address = f"{ip}:{config.PORT}"
|
||||
logger.info(address)
|
||||
response = requests.get(url, params={'address':address})
|
||||
logger.info(f"Response status code: {response.status_code}")
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Shutdown server")
|
||||
# 服务器启动服务
|
||||
import uvicorn
|
||||
uvicorn.run(app, host="0.0.0.0", port=config.PORT)
|
||||
|
|
|
@ -3,7 +3,6 @@ from pydantic import BaseModel
|
|||
import os
|
||||
import utils
|
||||
import queue
|
||||
import multiprocessing
|
||||
from multiprocessing import Process
|
||||
import word_title
|
||||
import time
|
||||
|
@ -55,7 +54,7 @@ def run_job():
|
|||
response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 5})
|
||||
applog.info(f'通知pdf开始解析url:{file_id}:{response.url}')
|
||||
applog.info(f'通知pdf开始解析状态:{file_id}:{response.text}')
|
||||
parsed_content, catalog_content = parse_docx(file_path) # catalog_content 目录需要写入数据库
|
||||
parsed_content, catalog_content = parse_docx(file_path)
|
||||
|
||||
json_parsed_content = json.loads(parsed_content)
|
||||
json_catalog_content = json.loads(catalog_content)
|
||||
|
@ -85,7 +84,7 @@ def run_job():
|
|||
p = Process(target=main_word.process_table, args=(file_id, job_info,))
|
||||
processes.append(p)
|
||||
p.start()
|
||||
applog.info(f'等待所有子任务完成,任务ID:{file_id}')
|
||||
applog.info(f'等待所有子任务完成,任务ID:{file_id}' )
|
||||
for p in processes:
|
||||
p.join()
|
||||
|
||||
|
@ -213,14 +212,14 @@ app.post("/parser/start",
|
|||
# 运行 FastAPI 应用
|
||||
if __name__ == "__main__":
|
||||
# 服务器启动服务
|
||||
# import uvicorn
|
||||
#
|
||||
# uvicorn.run(app, host="0.0.0.0", port=config.PORT)
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run(app, host="0.0.0.0", port=config.PORT)
|
||||
# 本地调试任务
|
||||
file_id = "201917"
|
||||
job_queue.put({
|
||||
'file_path': '1.docx',
|
||||
'file_id': file_id,
|
||||
})
|
||||
db_service_word.delete_database(file_id)
|
||||
run_job()
|
||||
# file_id = "201837"
|
||||
# job_queue.put({
|
||||
# 'file_path': '西部建设.docx',
|
||||
# 'file_id': file_id,
|
||||
# })
|
||||
# db_service_word.delete_database(file_id)
|
||||
# run_job()
|
||||
|
|
|
@ -1,33 +1,28 @@
|
|||
MILVUS_CLIENT='http://124.70.129.232:19530'
|
||||
#MILVUS_CLIENT='http://60.204.228.154:19530'
|
||||
MYSQL_HOST = '121.37.185.246'
|
||||
MILVUS_CLIENT='http://127.0.0.1:19530'
|
||||
MILVUS_HOST = '127.0.0.1'
|
||||
MILVUS_PORT = 19530
|
||||
MYSQL_HOST = '10.127.2.207'
|
||||
MYSQL_PORT = 3306
|
||||
MYSQL_USER = 'financial'
|
||||
MYSQL_PASSWORD = 'financial_8000'
|
||||
MYSQL_DB = 'financial_report'
|
||||
MYSQL_USER = 'financial_prod'
|
||||
MYSQL_PASSWORD = 'mmTFncqmDal5HLRGY0BV'
|
||||
MYSQL_DB = 'financial_report_test'
|
||||
NOTIFY_ADDR = 'http://10.127.2.206:8101/api/tenant/report/notify'
|
||||
FILE_PATH = '/root/pdf_parser/word/'
|
||||
|
||||
# NOTIFY_ADDR = 'http://192.168.0.175:8100/api/tenant/report/notify'
|
||||
|
||||
|
||||
NOTIFY_ADDR = 'http://127.0.0.1:8100/api/tenant/report/notify'
|
||||
|
||||
# REDIS_HOST = '127.0.0.1'
|
||||
REDIS_HOST = '123.60.153.169'
|
||||
REDIS_HOST = '10.127.2.206'
|
||||
REDIS_PORT = 6379
|
||||
REDIS_PASSWORD = 'Xgf_redis'
|
||||
FILE_PATH = '/root/word_parser/word/'
|
||||
PORT = 8001
|
||||
MEASURE_COUNT = 8
|
||||
|
||||
# MYSQL_HOST_APP = '192.168.0.201'#192.168.0.201
|
||||
# MYSQL_PORT_APP = 3306
|
||||
# MYSQL_USER_APP = 'root'
|
||||
# MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
|
||||
# MYSQL_DB_APP = 'financial_report_prod'
|
||||
|
||||
|
||||
MYSQL_HOST_APP = '121.37.185.246'#192.168.0.201
|
||||
MYSQL_HOST_APP = '10.127.2.207'
|
||||
MYSQL_PORT_APP = 3306
|
||||
MYSQL_USER_APP = 'financial'
|
||||
MYSQL_PASSWORD_APP = 'financial_8000'
|
||||
MYSQL_DB_APP = 'financial_report'
|
||||
MYSQL_USER_APP = 'financial_prod'
|
||||
MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
|
||||
MYSQL_DB_APP = 'financial_report_test'
|
||||
api_key = 'sk-f8413fcfa63a40e49dbb8ac1d0239f6d'
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -208,6 +208,17 @@ def update_ori_measure(conn,cursor,file_id):
|
|||
and t1.file_id = '{file_id}'
|
||||
and t2.year = '{year}'
|
||||
'''.format(file_id=file_id, year=report_year)
|
||||
|
||||
select_query_first_quarter = '''
|
||||
SELECT t2.measure_id,t2.measure_name,t1.ori_measure_id
|
||||
FROM ori_measure_list t1
|
||||
left join
|
||||
measure_config_first_quarter t2
|
||||
on t1.ori_measure_id = t2.ori_measure_id
|
||||
where t2.measure_id is not null and (t1.measure_id is null or t1.measure_id ='')
|
||||
and t1.file_id = '{file_id}'
|
||||
and t2.year = '{year}'
|
||||
'''.format(file_id=file_id, year=report_year)
|
||||
|
||||
if report_type == 1:
|
||||
start_time = time.time()
|
||||
|
@ -216,6 +227,13 @@ def update_ori_measure(conn,cursor,file_id):
|
|||
end_time = time.time()
|
||||
applog.info(f"更新数据查询 {(end_time - start_time):.2f} 秒。")
|
||||
applog.info(f'update_ori_measure方法走的是半年报')
|
||||
elif report_type == 2:
|
||||
start_time = time.time()
|
||||
cursor.execute(select_query_first_quarter)
|
||||
records = cursor.fetchall()
|
||||
end_time = time.time()
|
||||
applog.info(f"更新数据查询 {(end_time - start_time):.2f} 秒。")
|
||||
applog.info(f'update_ori_measure方法走的是一季报')
|
||||
elif report_type == 3:
|
||||
start_time = time.time()
|
||||
cursor.execute(select_query_thrid)
|
||||
|
@ -243,6 +261,9 @@ def update_ori_measure(conn,cursor,file_id):
|
|||
|
||||
if report_type == 0:
|
||||
table_name = "measure_config"
|
||||
elif report_type == 2:
|
||||
table_name = "measure_config_first_quarter"
|
||||
|
||||
elif report_type == 3:
|
||||
table_name = "measure_config_third_quarter"
|
||||
else:
|
||||
|
@ -342,7 +363,14 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
measure_index_records = cursor_app.fetchall()
|
||||
for measure_index_record in measure_index_records:
|
||||
measure_index_array.append(measure_index_record[0])
|
||||
|
||||
|
||||
if str(report_type) == "2":
|
||||
parent_table_pages = []
|
||||
table_index_array = []
|
||||
measure_index_array = []
|
||||
applog.info(f'黑名单的值是{parent_table_pages}和{table_index_array}以及新增的{measure_index_array}')
|
||||
applog.info(f"black_array:{black_array}")
|
||||
|
||||
record_start = record_range.split('-')[0]
|
||||
record_end = record_range.split('-')[1]
|
||||
|
@ -368,6 +396,8 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
output_fields=["measure_name","measure_value","table_num","table_index","measure_unit"],
|
||||
filter=filter_str
|
||||
)
|
||||
|
||||
|
||||
|
||||
# Convert the output to a formatted JSON string
|
||||
# for i in range(len(res[0])):
|
||||
|
@ -387,16 +417,18 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
#过滤表格上方文字黑名单关键词的页码和表格下标
|
||||
if f"{table_num}" in table_index_array:
|
||||
continue
|
||||
|
||||
|
||||
|
||||
|
||||
#过滤指标中包含黑名单关键词
|
||||
if utils.check_pdf_measure_black_list(pdf_measure):
|
||||
continue
|
||||
|
||||
if f"{table_num}" in measure_index_array and utils.check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,conn_app,cursor_app):
|
||||
#if utils.check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,conn_app,cursor_app):
|
||||
applog.info(f'经过第三层规则去除了{table_num}页的{pdf_measure}指标')
|
||||
continue
|
||||
|
||||
|
||||
|
||||
if vector_distance > distance and table_num not in parent_table_pages:
|
||||
#检测规则开始
|
||||
#判断抽取指标和财报指标周期是否相同
|
||||
|
@ -406,7 +438,8 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
applog.info(f'第1处{ori_period}和{pdf_period}')
|
||||
if(ori_period != pdf_period):
|
||||
continue
|
||||
|
||||
|
||||
|
||||
#判断抽取指标和财报指标是否期初指标
|
||||
start_ori_period = utils.get_start_period_type(ori_measure_name)
|
||||
start_pdf_period = utils.get_start_period_type(pdf_measure)
|
||||
|
@ -422,6 +455,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
applog.info(f'第3处{ori_season_type}和{pdf_season_type}')
|
||||
if(ori_season_type != pdf_season_type):
|
||||
continue
|
||||
|
||||
|
||||
#判断是否都是扣非指标
|
||||
ori_kf_type = utils.get_kf_flag(ori_measure_name)
|
||||
|
@ -429,8 +463,9 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
if pdf_measure == '2023年6月30日货币资金合计':
|
||||
applog.info(f'第4处{ori_kf_type}和{pdf_kf_type}')
|
||||
if(ori_kf_type != pdf_kf_type):
|
||||
applog.info(f'扣非指标{table_num}页的{pdf_measure}指标')
|
||||
continue
|
||||
|
||||
|
||||
#判断抽取指标和财报指标类型是否相同,是否都是百分比
|
||||
ori_type = utils.get_percent_flag(ori_measure_name)
|
||||
pdf_type = utils.get_percent_flag(pdf_measure)
|
||||
|
@ -459,12 +494,13 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
check_records = cursor.fetchall()
|
||||
if(len(check_records)) > 0:
|
||||
continue
|
||||
|
||||
|
||||
#判断是否包含黑名单
|
||||
if(utils.check_black_list(measure_name,pdf_measure,black_array)):
|
||||
continue
|
||||
|
||||
|
||||
if(utils.check_white_list(measure_name,pdf_measure)):
|
||||
applog.info(f"measure_name{measure_name},pdf_measure{pdf_measure}")
|
||||
continue
|
||||
|
||||
#判断抽取指标和财报指标类型是否都是增长类,比如同比变动为增长类
|
||||
|
@ -483,7 +519,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
|
||||
if(ori_report_start != pdf_report_start):
|
||||
continue
|
||||
|
||||
|
||||
#检测规则结束
|
||||
#获取指标单位数据,除了百分比
|
||||
if(utils.get_percent_flag(measure_name) == '0'):
|
||||
|
@ -496,7 +532,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
unit = unit_records[0][0]
|
||||
else:
|
||||
unit = '元'
|
||||
|
||||
|
||||
data_to_insert = (file_id, file_name, "table", int(table_num), int(table_index), ori_measure_id, ori_measure_name, measure_value, create_time, create_time, vector_distance, pdf_measure,measure_id,measure_name,unit)
|
||||
cursor.execute(insert_query, data_to_insert)
|
||||
conn.commit()
|
||||
|
@ -508,6 +544,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
conn.close()
|
||||
client.close()
|
||||
|
||||
#
|
||||
def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,file_id,file_name):
|
||||
select_year_select = f"""select report_type,year from report_check where id = {file_id}"""
|
||||
cursor.execute(select_year_select)
|
||||
|
@ -527,10 +564,16 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
|
|||
SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config_third_quarter
|
||||
where year = '{year}'
|
||||
'''.format(year=report_year)
|
||||
select_query_first_quarter = '''
|
||||
SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config_first_quarter
|
||||
where year = '{year}'
|
||||
'''.format(year=report_year)
|
||||
# select_black_array_query = 'SELECT measure_name, keywords FROM measure_black_list where isdel = 0'
|
||||
select_black_array_query = '''
|
||||
SELECT measure_name, keywords FROM measure_black_list where isdel = 0 and find_in_set('{year}',year) and find_in_set('{flag}',flag)
|
||||
'''.format(year=report_year, flag=report_type)
|
||||
|
||||
|
||||
black_array = []
|
||||
cursor.execute(select_black_array_query)
|
||||
results = cursor.fetchall()
|
||||
|
@ -553,6 +596,20 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
|
|||
p = Process(target=insert_table_from_vector_mul_process, args=(parent_table_pages,file_id,file_name,records,record_range,black_array,))
|
||||
processes.append(p)
|
||||
p.start()
|
||||
elif report_type == 2:
|
||||
start_time = time.time()
|
||||
cursor.execute(select_query_first_quarter)
|
||||
records = cursor.fetchall()
|
||||
end_time = time.time()
|
||||
applog.info(f"向量配置数据查询 {(end_time - start_time):.2f} 秒。")
|
||||
applog.info('insert_table_measure_from_vector_async_process方法走的一季报')
|
||||
start_time = time.time()
|
||||
records_range_parts = utils.get_range(len(records),MEASURE_COUNT)
|
||||
processes = []
|
||||
for record_range in records_range_parts:
|
||||
p = Process(target=insert_table_from_vector_mul_process, args=(parent_table_pages,file_id,file_name,records,record_range,black_array,))
|
||||
processes.append(p)
|
||||
p.start()
|
||||
elif report_type == 3:
|
||||
start_time = time.time()
|
||||
cursor.execute(select_query_thrid)
|
||||
|
@ -698,11 +755,15 @@ def insert_measure_data_to_milvus(client,table_info,cursor,conn):
|
|||
measure_list = table['measure_list']
|
||||
for measure in measure_list:
|
||||
measure_name = measure['measure_name']
|
||||
# 需要跳过的一些指标
|
||||
black_list = ["营业总成本"]
|
||||
if any(black in measure_name for black in black_list):
|
||||
continue
|
||||
measure_value = measure['measure_value'].replace("(", "").replace(")", "")
|
||||
measure_name = utils.get_clean_text(measure_name)
|
||||
measure_name = measure_name.replace('2024','2024年').replace('2023','2023年').replace('2022','2022年').replace('(','').replace(')','')#这个真绝了,怎么都删不掉
|
||||
measure_name = measure_name.replace('2023','2023年').replace('2022','2022年').replace('(','').replace(')','')#这个真绝了,怎么都删不掉
|
||||
#measure_name_1 = measure_name.replace('调整后','')
|
||||
quarters = ['第一季度', '第二季度', '第三季度', '第四季度','增减','2024年','2023年','2022年','2021年','年']
|
||||
quarters = ['第一季度', '第二季度', '第三季度', '第四季度','增减','2023年','2022年','2021年','年']
|
||||
for quarter in quarters:
|
||||
measure_name = measure_name.replace(quarter * 2, quarter)
|
||||
pattern_dup = re.compile(r'(\w{3,})\1+')#去掉任意超过两个字且重复的字符
|
||||
|
@ -712,7 +773,6 @@ def insert_measure_data_to_milvus(client,table_info,cursor,conn):
|
|||
measure_name = pattern_dup.sub(r'\1', measure_name)
|
||||
measure_name_1 = measure_name.replace('调整后','').replace('上年期末数','上年期末').replace('上年期末','上年年末')
|
||||
measure_unit = measure['measure_unit']
|
||||
|
||||
if re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', measure_value) and any(key_word in measure_name for key_word in measure_name_keywords):
|
||||
vector_obj = utils.embed_with_str(measure_name_1)
|
||||
vector = vector_obj.output["embeddings"][0]["embedding"]
|
||||
|
@ -822,7 +882,6 @@ def delete_database(file_id):
|
|||
"delete from measure_list where file_id = %s;",
|
||||
"delete from word_parse_process where file_id = %s;",
|
||||
"delete from table_unit_info where file_id = %s;",
|
||||
"delete from word_measure_parse_process where file_id = %s;",
|
||||
# "delete from a where file_id = %s;",
|
||||
# "delete from b where file_id = %s;",
|
||||
]
|
||||
|
@ -898,23 +957,15 @@ def batch_insert_page_text(table_info, conn, cursor, table_name):
|
|||
file_id = table_info['file_id']
|
||||
page_num = int(table_info['page_num'])
|
||||
text_lines = table_info['text']
|
||||
|
||||
# 1. 检查表是否为空
|
||||
check_if_empty_query = f"SELECT COUNT(*) FROM {table_name} where file_id = {file_id} and page_num = {page_num}"
|
||||
cursor.execute(check_if_empty_query)
|
||||
is_table_empty = cursor.fetchone()[0] == 0
|
||||
|
||||
if is_table_empty:
|
||||
# 表为空,直接插入数据
|
||||
insert_query = f'''
|
||||
INSERT INTO {table_name}
|
||||
(file_id, page_num, text)
|
||||
VALUES (%s, %s, %s)
|
||||
'''
|
||||
data_to_insert = [(file_id, page_num, text_lines) ]
|
||||
cursor.executemany(insert_query, data_to_insert)
|
||||
else:
|
||||
pass
|
||||
insert_query = f'''
|
||||
INSERT INTO {table_name}
|
||||
(file_id, page_num, text)
|
||||
VALUES (%s, %s, %s)
|
||||
'''
|
||||
data_to_insert = [(file_id, page_num, text_lines) ]
|
||||
cursor.executemany(insert_query, data_to_insert)
|
||||
|
||||
conn.commit()
|
||||
def file_type_check(file_id):
|
||||
conn = mysql.connector.connect(
|
||||
|
|
|
@ -0,0 +1,201 @@
|
|||
import pymssql
|
||||
import mysql.connector
|
||||
import logging
|
||||
|
||||
# 配置日志
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
|
||||
# SQL Server配置
|
||||
sql_server_config = {
|
||||
"server": "203.192.15.17",
|
||||
"port": 28063,
|
||||
"user": "zncbuser",
|
||||
"password": "ZZB-Cbindex-data",
|
||||
"database": "jydb",
|
||||
}
|
||||
|
||||
# MySQL配置
|
||||
mysql_config = {
|
||||
"host": "rm-bp1f85h3xs6mvnf5e3o.mysql.rds.aliyuncs.com",
|
||||
"user": "zzb_jydb",
|
||||
"password": "Ysdbsdjs89Yrqwp",
|
||||
"database": "zzb_jydb",
|
||||
}
|
||||
|
||||
def sync_table(table_name):
|
||||
try:
|
||||
# 连接到SQL Server
|
||||
sql_server_conn = pymssql.connect(**sql_server_config)
|
||||
sql_server_cursor = sql_server_conn.cursor()
|
||||
|
||||
# 连接到MySQL
|
||||
mysql_conn = mysql.connector.connect(**mysql_config)
|
||||
mysql_cursor = mysql_conn.cursor()
|
||||
|
||||
logging.info(f"Processing table: {table_name}")
|
||||
|
||||
# 检查MySQL中是否已存在该表
|
||||
mysql_cursor.execute(f"SHOW TABLES LIKE '{table_name}'")
|
||||
table_exists = mysql_cursor.fetchone()
|
||||
|
||||
# 获取表的列信息
|
||||
sql_server_cursor.execute(f"""
|
||||
SELECT
|
||||
COLUMN_NAME,
|
||||
DATA_TYPE,
|
||||
CHARACTER_MAXIMUM_LENGTH,
|
||||
NUMERIC_PRECISION,
|
||||
NUMERIC_SCALE
|
||||
FROM INFORMATION_SCHEMA.COLUMNS
|
||||
WHERE TABLE_NAME = '{table_name}'
|
||||
""")
|
||||
columns = sql_server_cursor.fetchall()
|
||||
|
||||
# 检查是否存在 XGRQ 或 UpdateTime 字段
|
||||
update_time_fields = ['xgrq', 'updatetime'] # 可能的字段名
|
||||
update_time_field = None
|
||||
for col in columns:
|
||||
if col[0].lower() in update_time_fields:
|
||||
update_time_field = col[0] # 找到第一个匹配的字段
|
||||
break
|
||||
|
||||
logging.info(f"Table {table_name} has update time field: {update_time_field}")
|
||||
|
||||
if not table_exists:
|
||||
# 如果表不存在,创建表
|
||||
create_table_sql = f"CREATE TABLE {table_name} ("
|
||||
for col in columns:
|
||||
col_name = col[0]
|
||||
col_type = col[1]
|
||||
# 类型映射逻辑(略)
|
||||
create_table_sql += f"`{col_name}` {col_type}, "
|
||||
create_table_sql = create_table_sql.rstrip(", ") + ")"
|
||||
logging.info(f"Create table SQL: {create_table_sql}")
|
||||
|
||||
# 在MySQL中创建表
|
||||
mysql_cursor.execute(create_table_sql)
|
||||
logging.info(f"Table {table_name} created in MySQL.")
|
||||
else:
|
||||
logging.info(f"Table {table_name} already exists in MySQL. Updating data...")
|
||||
|
||||
# 获取SQL Server中的所有id
|
||||
sql_server_cursor.execute(f"SELECT {columns[0][0]} FROM {table_name}")
|
||||
sql_server_ids = {row[0] for row in sql_server_cursor.fetchall()}
|
||||
|
||||
# 获取MySQL中的所有id
|
||||
mysql_cursor.execute(f"SELECT {columns[0][0]} FROM {table_name}")
|
||||
mysql_ids = {row[0] for row in mysql_cursor.fetchall()}
|
||||
|
||||
# 找出需要插入的id
|
||||
ids_to_insert = sql_server_ids - mysql_ids
|
||||
logging.info(f"Found {len(ids_to_insert)} new rows to insert.")
|
||||
|
||||
# 分批插入数据
|
||||
batch_size = 10000 # 每批次处理的行数
|
||||
id_list = list(ids_to_insert)
|
||||
for i in range(0, len(id_list), batch_size):
|
||||
batch_ids = id_list[i:i + batch_size]
|
||||
|
||||
# 从SQL Server中查询需要插入的数据
|
||||
sql_server_cursor.execute(f"""
|
||||
SELECT * FROM {table_name}
|
||||
WHERE {columns[0][0]} IN ({', '.join(map(str, batch_ids))})
|
||||
""")
|
||||
rows_to_insert = sql_server_cursor.fetchall()
|
||||
|
||||
# 插入数据到MySQL
|
||||
if rows_to_insert:
|
||||
insert_sql = f"INSERT INTO {table_name} ({', '.join([f'`{col[0]}`' for col in columns])}) VALUES ({', '.join(['%s'] * len(columns))})"
|
||||
mysql_cursor.executemany(insert_sql, rows_to_insert)
|
||||
mysql_conn.commit()
|
||||
logging.info(f"Inserted {len(rows_to_insert)} rows into {table_name}.")
|
||||
|
||||
# 如果存在更新字段(XGRQ 或 UpdateTime),检查是否需要更新
|
||||
if update_time_field:
|
||||
logging.info(f"Checking for updates based on {update_time_field} field in table: {table_name}")
|
||||
|
||||
# 获取SQL Server中的id和更新字段的值,且更新字段大于2023年
|
||||
sql_server_cursor.execute(f"""
|
||||
SELECT {columns[0][0]}, {update_time_field} FROM {table_name}
|
||||
WHERE {update_time_field} > '2023-11-12 20:23:23'
|
||||
""")
|
||||
sql_server_update_data = {row[0]: row[1] for row in sql_server_cursor.fetchall()}
|
||||
|
||||
# 获取MySQL中的id和更新字段的值
|
||||
mysql_cursor.execute(f"""
|
||||
SELECT {columns[0][0]}, {update_time_field} FROM {table_name}
|
||||
""")
|
||||
mysql_update_data = {row[0]: row[1] for row in mysql_cursor.fetchall()}
|
||||
|
||||
# 找出需要更新的id
|
||||
ids_to_update = []
|
||||
for id, sql_server_update_time in sql_server_update_data.items():
|
||||
if id in mysql_update_data and sql_server_update_time != mysql_update_data[id]:
|
||||
ids_to_update.append(id)
|
||||
|
||||
logging.info(f"Found {len(ids_to_update)} rows to update.")
|
||||
|
||||
# 分批更新数据
|
||||
for i in range(0, len(ids_to_update), batch_size):
|
||||
batch_ids = ids_to_update[i:i + batch_size]
|
||||
|
||||
# 从SQL Server中查询需要更新的数据,且更新字段大于2023年
|
||||
sql_server_cursor.execute(f"""
|
||||
SELECT * FROM {table_name}
|
||||
WHERE {columns[0][0]} IN ({', '.join(map(str, batch_ids))})
|
||||
AND {update_time_field} > '2023-11-12 20:23:23'
|
||||
""")
|
||||
rows_to_update = sql_server_cursor.fetchall()
|
||||
|
||||
# 更新数据到MySQL
|
||||
if rows_to_update:
|
||||
update_sql = f"UPDATE {table_name} SET "
|
||||
update_sql += ", ".join([f"`{col[0]}` = %s" for col in columns[1:]]) # 跳过id列
|
||||
update_sql += f" WHERE `{columns[0][0]}` = %s"
|
||||
update_values = [list(row[1:]) + [row[0]] for row in rows_to_update] # 跳过id列
|
||||
mysql_cursor.executemany(update_sql, update_values)
|
||||
mysql_conn.commit()
|
||||
logging.info(f"Updated {len(rows_to_update)} rows in table {table_name}.")
|
||||
|
||||
logging.info(f"Sync completed for table: {table_name}")
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to sync table {table_name}. Error: {e}")
|
||||
finally:
|
||||
# 关闭连接
|
||||
if 'sql_server_cursor' in locals():
|
||||
sql_server_cursor.close()
|
||||
if 'sql_server_conn' in locals():
|
||||
sql_server_conn.close()
|
||||
if 'mysql_cursor' in locals():
|
||||
mysql_cursor.close()
|
||||
if 'mysql_conn' in locals():
|
||||
mysql_conn.close()
|
||||
|
||||
def main():
|
||||
try:
|
||||
# 连接到SQL Server
|
||||
sql_server_conn = pymssql.connect(**sql_server_config)
|
||||
sql_server_cursor = sql_server_conn.cursor()
|
||||
|
||||
# 获取SQL Server中的所有表
|
||||
sql_server_cursor.execute("SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE' ORDER BY TABLE_NAME")
|
||||
tables = sql_server_cursor.fetchall()
|
||||
|
||||
# 处理每个表
|
||||
for table in tables:
|
||||
if table[0].lower() == "lc_mainshlistnew":
|
||||
sync_table(table[0])
|
||||
|
||||
logging.info("All tables synced successfully!")
|
||||
except Exception as e:
|
||||
logging.error(f"Main function failed. Error: {e}")
|
||||
finally:
|
||||
# 关闭连接
|
||||
if 'sql_server_cursor' in locals():
|
||||
sql_server_cursor.close()
|
||||
if 'sql_server_conn' in locals():
|
||||
sql_server_conn.close()
|
||||
|
||||
# 启动主函数
|
||||
if __name__ == "__main__":
|
||||
main()
|
File diff suppressed because it is too large
Load Diff
|
@ -1,204 +0,0 @@
|
|||
2024-12-29 16:13:29,975|zzb_logger : INFO 开始启动文件解析任务: 1.docx
|
||||
2024-12-29 16:13:36,106|zzb_logger : INFO 任务 201917 完成
|
||||
2024-12-29 16:15:16,205|zzb_logger : INFO 开始启动文件解析任务: 1.docx
|
||||
2024-12-29 16:15:22,356|zzb_logger : INFO 任务 201917 完成
|
||||
2024-12-29 16:17:15,693|zzb_logger : INFO 开始启动文件解析任务: 1.docx
|
||||
2024-12-29 16:17:15,696|zzb_logger : INFO 通知pdf开始解析url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=5
|
||||
2024-12-29 16:17:15,696|zzb_logger : INFO 通知pdf开始解析状态:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
||||
"http://www.w3.org/TR/html4/strict.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
|
||||
<title>Error response</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Error response</h1>
|
||||
<p>Error code: 404</p>
|
||||
<p>Message: File not found.</p>
|
||||
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
2024-12-29 16:17:25,319|zzb_logger : INFO text,任务ID:201917
|
||||
2024-12-29 16:17:26,701|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (5116)...
|
||||
2024-12-29 16:17:28,173|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (22268)...
|
||||
2024-12-29 16:17:29,591|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (27736)...
|
||||
2024-12-29 16:17:30,937|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (38276)...
|
||||
2024-12-29 16:17:32,294|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (38292)...
|
||||
2024-12-29 16:17:33,664|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (38240)...
|
||||
2024-12-29 16:17:35,153|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (28536)...
|
||||
2024-12-29 16:17:36,559|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (37552)...
|
||||
2024-12-29 16:17:37,929|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (37856)...
|
||||
2024-12-29 16:17:39,291|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (10528)...
|
||||
2024-12-29 16:17:40,688|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (31444)...
|
||||
2024-12-29 16:17:42,133|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (11108)...
|
||||
2024-12-29 16:17:43,518|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (23236)...
|
||||
2024-12-29 16:17:44,901|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (23572)...
|
||||
2024-12-29 16:17:46,495|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (39604)...
|
||||
2024-12-29 16:17:47,899|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (4076)...
|
||||
2024-12-29 16:17:47,899|zzb_logger : INFO 等待所有子任务完成,任务ID:201917
|
||||
2024-12-29 16:18:02,194|zzb_logger : INFO word表格中 text解析完成,任务ID:201917
|
||||
2024-12-29 16:18:02,196|zzb_logger : INFO 开始解析word表表格中的table,任务ID:201917
|
||||
2024-12-29 16:18:03,525|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36176)...
|
||||
2024-12-29 16:18:04,585|zzb_logger : INFO Task 解析表格201917 runs 1.06 seconds.
|
||||
2024-12-29 16:18:04,873|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (35368)...
|
||||
2024-12-29 16:18:05,769|zzb_logger : INFO Task 解析表格201917 runs 0.90 seconds.
|
||||
2024-12-29 16:18:06,263|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (33004)...
|
||||
2024-12-29 16:18:07,225|zzb_logger : INFO Task 解析表格201917 runs 0.96 seconds.
|
||||
2024-12-29 16:18:07,628|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (30764)...
|
||||
2024-12-29 16:18:08,427|zzb_logger : INFO Task 解析表格201917 runs 0.80 seconds.
|
||||
2024-12-29 16:18:08,976|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (29608)...
|
||||
2024-12-29 16:18:09,864|zzb_logger : INFO Task 解析表格201917 runs 0.89 seconds.
|
||||
2024-12-29 16:18:10,588|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (5404)...
|
||||
2024-12-29 16:18:11,360|zzb_logger : INFO Task 解析表格201917 runs 0.77 seconds.
|
||||
2024-12-29 16:18:11,966|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36200)...
|
||||
2024-12-29 16:18:12,030|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36328)...
|
||||
2024-12-29 16:18:12,892|zzb_logger : INFO Task 解析表格201917 runs 0.93 seconds.
|
||||
2024-12-29 16:18:13,034|zzb_logger : INFO Task 解析表格201917 runs 1.00 seconds.
|
||||
2024-12-29 16:18:13,392|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (39712)...
|
||||
2024-12-29 16:18:14,166|zzb_logger : INFO Task 解析表格201917 runs 0.77 seconds.
|
||||
2024-12-29 16:18:15,030|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (17184)...
|
||||
2024-12-29 16:18:15,084|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (38828)...
|
||||
2024-12-29 16:18:15,156|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (39596)...
|
||||
2024-12-29 16:18:15,194|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36908)...
|
||||
2024-12-29 16:18:15,268|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (38088)...
|
||||
2024-12-29 16:18:15,273|zzb_logger : INFO 解析表格时出现了异常 setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (8,) + inhomogeneous part. 内容为{'type': 'table', 'index': 1438, 'data': [['项目', '期末', '期末', '期末', '期末', '期末', '期初', '期初', '期初', '期初', '期初', '期初', '期初', '期初'], ['', '账面余额', '账面价值', '受限类型', '受限情况', '受限情况', '账面余额', '账面余额', '账面价值', '账面价值', '受限类型', '受限类型', '受限情况', ''], ['货币资金', '485,532.72', '485,532.72', '', '住房专用基金', '住房专用基金', '482,151.75', '482,151.75', '482,151.75', '482,151.75', '', '', '住房专用基金', ''], ['固定资产', '9,798,299.46', '9,798,299.46', '', '金融机构借款抵押', '3,747,470.09', '3,747,470.09', '3,747,470.09', '3,747,470.09', '', '', '金融机构借款抵押', '金融机构借款抵押'], ['无形资产', '7,982,261.87', '7,982,261.87', '', '金融机构借款抵押', '5,437,462.92', '5,437,462.92', '5,437,462.92', '5,437,462.92', '', '', '金融机构借款抵押', '金融机构借款抵押'], ['货币资金', '43,997,452.57', '43,997,452.57', '', '银行保证金', '63,388,483.00', '63,388,483.00', '63,388,483.00', '63,388,483.00', '', '', '银行保证金', '银行保证金'], ['投资性房地产', '62,041,831.52', '62,041,831.52', '', '金融机构借款抵押', '67,653,392.10', '67,653,392.10', '67,653,392.10', '67,653,392.10', '', '', '金融机构借款抵押', '金融机构借款抵押'], ['合计', '124,305,378.14', '124,305,378.14', '', '', '140,708,959.86', '140,708,959.86', '140,708,959.86', '140,708,959.86', '', '', '', '']]}
|
||||
2024-12-29 16:18:15,722|zzb_logger : INFO Task 解析表格201917 runs 0.69 seconds.
|
||||
2024-12-29 16:18:15,873|zzb_logger : INFO Task 解析表格201917 runs 0.79 seconds.
|
||||
2024-12-29 16:18:16,067|zzb_logger : INFO Task 解析表格201917 runs 0.91 seconds.
|
||||
2024-12-29 16:18:16,086|zzb_logger : INFO Task 解析表格201917 runs 0.89 seconds.
|
||||
2024-12-29 16:18:16,158|zzb_logger : INFO Task 解析表格201917 runs 0.89 seconds.
|
||||
2024-12-29 16:18:16,787|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (39052)...
|
||||
2024-12-29 16:18:16,847|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (35928)...
|
||||
2024-12-29 16:18:17,456|zzb_logger : INFO Task 解析表格201917 runs 0.61 seconds.
|
||||
2024-12-29 16:18:17,644|zzb_logger : INFO Task 解析表格201917 runs 0.86 seconds.
|
||||
2024-12-29 16:18:17,819|zzb_logger : INFO word表格中 table解析完成,任务ID:201917
|
||||
2024-12-29 16:18:17,985|zzb_logger : INFO 解析任务 201917 完成,耗时62.29 秒。
|
||||
2024-12-29 16:18:18,106|zzb_logger : INFO 通知开始抽取指标url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=6
|
||||
2024-12-29 16:18:18,106|zzb_logger : INFO 通知开始抽取指标状态:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
||||
"http://www.w3.org/TR/html4/strict.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
|
||||
<title>Error response</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Error response</h1>
|
||||
<p>Error code: 404</p>
|
||||
<p>Message: File not found.</p>
|
||||
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
2024-12-29 16:18:18,107|zzb_logger : INFO 开始表格指标抽取,任务ID:201917
|
||||
2024-12-29 16:18:20,187|zzb_logger : INFO 提取指标任务 0-10 (29656)...
|
||||
2024-12-29 16:18:21,575|zzb_logger : INFO 提取指标任务 10-20 (38952)...
|
||||
2024-12-29 16:18:22,849|zzb_logger : INFO 提取指标任务 20-30 (31900)...
|
||||
2024-12-29 16:18:24,192|zzb_logger : INFO 提取指标任务 30-40 (30420)...
|
||||
2024-12-29 16:18:25,554|zzb_logger : INFO 提取指标任务 40-50 (32448)...
|
||||
2024-12-29 16:18:26,909|zzb_logger : INFO 提取指标任务 50-60 (37708)...
|
||||
2024-12-29 16:18:28,305|zzb_logger : INFO 提取指标任务 60-70 (36136)...
|
||||
2024-12-29 16:18:28,933|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,933|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,933|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,936|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:29,637|zzb_logger : INFO 提取指标任务 70-80 (39120)...
|
||||
2024-12-29 16:18:42,814|zzb_logger : INFO 被删除的字符: 000000
|
||||
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
|
||||
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
|
||||
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
|
||||
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
|
||||
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
|
||||
2024-12-29 16:18:46,511|zzb_logger : INFO 提取指标 40-50 runs 20.96 seconds.
|
||||
2024-12-29 16:18:54,027|zzb_logger : INFO 提取指标 70-80 runs 24.39 seconds.
|
||||
2024-12-29 16:19:17,236|zzb_logger : INFO 提取指标 60-70 runs 48.93 seconds.
|
||||
2024-12-29 16:19:20,151|zzb_logger : INFO 提取指标 30-40 runs 55.96 seconds.
|
||||
2024-12-29 16:19:40,383|zzb_logger : INFO 提取指标 50-60 runs 73.47 seconds.
|
||||
2024-12-29 16:20:06,573|zzb_logger : INFO 提取指标 0-10 runs 106.39 seconds.
|
||||
2024-12-29 16:20:44,937|zzb_logger : INFO 提取指标 10-20 runs 143.36 seconds.
|
||||
2024-12-29 16:20:50,959|zzb_logger : INFO 提取指标 20-30 runs 148.11 seconds.
|
||||
2024-12-29 16:20:51,337|zzb_logger : INFO 表格指标抽取完成,任务ID:201917
|
||||
2024-12-29 16:20:51,337|zzb_logger : INFO 表格指标抽取 201917 完成,耗时153.23 秒。
|
||||
2024-12-29 16:20:51,337|zzb_logger : INFO 启动这个指标归一化任务ID-修改测试:201917
|
||||
2024-12-29 16:20:51,549|zzb_logger : INFO 目录黑名单为:[]
|
||||
2024-12-29 16:20:52,316|zzb_logger : INFO 向量配置数据查询 0.11 秒。
|
||||
2024-12-29 16:20:52,317|zzb_logger : INFO insert_table_measure_from_vector_async_process方法走的半年报
|
||||
2024-12-29 16:20:54,191|zzb_logger : INFO Run task 0-351 (41216)...
|
||||
2024-12-29 16:20:54,192|zzb_logger : INFO 插入数据 2815
|
||||
2024-12-29 16:20:54,742|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
|
||||
2024-12-29 16:20:55,664|zzb_logger : INFO Run task 351-702 (16388)...
|
||||
2024-12-29 16:20:55,664|zzb_logger : INFO 插入数据 2815
|
||||
2024-12-29 16:20:56,152|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
|
||||
2024-12-29 16:20:57,120|zzb_logger : INFO Run task 702-1053 (41796)...
|
||||
2024-12-29 16:20:57,120|zzb_logger : INFO 插入数据 2815
|
||||
2024-12-29 16:20:57,611|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
|
||||
2024-12-29 16:20:58,818|zzb_logger : INFO Run task 1053-1404 (39320)...
|
||||
2024-12-29 16:20:58,818|zzb_logger : INFO 插入数据 2815
|
||||
2024-12-29 16:20:59,324|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
|
||||
2024-12-29 16:21:00,159|zzb_logger : INFO Run task 1404-1755 (41868)...
|
||||
2024-12-29 16:21:00,159|zzb_logger : INFO 插入数据 2815
|
||||
2024-12-29 16:21:00,887|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
|
||||
2024-12-29 16:21:01,473|zzb_logger : INFO Run task 1755-2106 (26816)...
|
||||
2024-12-29 16:21:01,473|zzb_logger : INFO 插入数据 2815
|
||||
2024-12-29 16:21:02,171|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
|
||||
2024-12-29 16:21:02,832|zzb_logger : INFO Run task 2106-2457 (32120)...
|
||||
2024-12-29 16:21:02,832|zzb_logger : INFO 插入数据 2815
|
||||
2024-12-29 16:21:03,703|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
|
||||
2024-12-29 16:21:04,179|zzb_logger : INFO 等待所有子任务完成,任务ID:201917
|
||||
2024-12-29 16:21:04,179|zzb_logger : INFO Run task 2457-2815 (38332)...
|
||||
2024-12-29 16:21:04,179|zzb_logger : INFO 插入数据 2815
|
||||
2024-12-29 16:21:04,886|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
|
||||
2024-12-29 16:23:00,285|zzb_logger : INFO 所有子任务完成,任务ID:201917
|
||||
2024-12-29 16:23:00,286|zzb_logger : INFO 启动指标归一化任务ID:201917
|
||||
2024-12-29 16:23:00,286|zzb_logger : INFO 向量更新时间 127.97 秒。
|
||||
2024-12-29 16:23:00,474|zzb_logger : INFO 更新数据查询 0.17 秒。
|
||||
2024-12-29 16:23:00,474|zzb_logger : INFO update_ori_measure方法走的是半年报
|
||||
2024-12-29 16:23:00,474|zzb_logger : INFO 更新数据更新 0.00 秒。
|
||||
2024-12-29 16:23:00,522|zzb_logger : INFO 更新数据写入 0.05 秒。
|
||||
2024-12-29 16:23:00,522|zzb_logger : INFO 归一化完成任务ID:201917
|
||||
2024-12-29 16:23:00,522|zzb_logger : INFO 任务 201917 完成,耗时344.83 秒。
|
||||
2024-12-29 16:23:00,669|zzb_logger : INFO 通知任务状态url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=1
|
||||
2024-12-29 16:23:00,669|zzb_logger : INFO 通知任务状态任务:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
||||
"http://www.w3.org/TR/html4/strict.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
|
||||
<title>Error response</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Error response</h1>
|
||||
<p>Error code: 404</p>
|
||||
<p>Message: File not found.</p>
|
||||
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
2024-12-29 16:23:00,821|zzb_logger : INFO 任务 201917 完成
|
|
@ -427,19 +427,18 @@ def process_text_content(file_id,texts,tables,full_texts,type =0):
|
|||
"type" : "text",
|
||||
'content' : line_text,
|
||||
}},conn,cursor,"word_parse_process")
|
||||
|
||||
# 给慎用词校验用
|
||||
db_service_word.insert_word_parse_process({
|
||||
'file_id': file_id,
|
||||
'page_num': t["index"],
|
||||
'page_count': 100,
|
||||
'type': 'text',
|
||||
'content': {
|
||||
'page_num': t["index"],
|
||||
'table_index': t["index"],
|
||||
"type": "text",
|
||||
'content': line_text,
|
||||
}}, conn, cursor, "word_parse_data")
|
||||
# 给慎用词校验用
|
||||
db_service_word.insert_word_parse_process({
|
||||
'file_id': file_id,
|
||||
'page_num': t["index"],
|
||||
'page_count': 100,
|
||||
'type': 'text',
|
||||
'content': {
|
||||
'page_num': t["index"],
|
||||
'table_index': t["index"],
|
||||
"type": "text",
|
||||
'content': line_text,
|
||||
}}, conn, cursor, "word_parse_data")
|
||||
|
||||
table_name = "word_text_info"
|
||||
if type == 1:
|
||||
|
@ -450,6 +449,22 @@ def process_text_content(file_id,texts,tables,full_texts,type =0):
|
|||
'page_num' : t["index"],
|
||||
'text' : line_text
|
||||
},conn,cursor, table_name)
|
||||
|
||||
|
||||
for t in tables:
|
||||
page_num = t["index"]
|
||||
for lines in t["data"]:
|
||||
lines = list(set(lines))
|
||||
for line in lines:
|
||||
if len(line) == 0:
|
||||
continue
|
||||
db_service_word.batch_insert_page_text({
|
||||
'file_id': file_id,
|
||||
'page_num' : page_num,
|
||||
'text' : line
|
||||
},conn,cursor,"word_text_info")
|
||||
|
||||
|
||||
except Exception as e:
|
||||
applog.error(f'文本处理异常{e}')
|
||||
|
||||
|
@ -519,12 +534,12 @@ def get_table_measure(file_id, word_tables, record_range):
|
|||
record_start = record_range.split('-')[0]
|
||||
record_end = record_range.split('-')[1]
|
||||
for index in range(int(record_start),int(record_end)):
|
||||
t = word_tables[index]
|
||||
t = word_tables[index][0]
|
||||
measure_obj =[]
|
||||
data_dict = {}
|
||||
measure_list = []
|
||||
try:
|
||||
arr = np.array(t['data'])
|
||||
arr = np.array(t["data"])
|
||||
rows, cols = arr.shape
|
||||
if rows == 1 and cols == 1:
|
||||
continue
|
||||
|
@ -679,7 +694,7 @@ def update_measure_data(file_id,file_path,parent_table_pages):
|
|||
# 创建一个cursor对象来执行SQL语句
|
||||
cursor_app = conn_app.cursor(buffered=True)
|
||||
applog.info(f'目录黑名单为:{parent_table_pages}')
|
||||
db_service_word.delete_to_run(conn,cursor,file_id)
|
||||
# db_service_word.delete_to_run(conn,cursor,file_id)
|
||||
db_service_word.insert_table_measure_from_vector_async_process(cursor,parent_table_pages,file_id,file_path)
|
||||
|
||||
# #指标归一化处理
|
||||
|
@ -692,15 +707,39 @@ def update_measure_data(file_id,file_path,parent_table_pages):
|
|||
|
||||
def merge_consecutive_arrays(word_info):
    """Group consecutive table objects that belong to one logical table.

    Every 'table' object from word_info is copied to the output as-is.
    Additionally, a table whose first row contains a Chinese character in
    every cell after the first is treated as the start of a group; a
    following table that fails that check is treated as a continuation and
    its rows are folded into the previous table's data.  Any pending group
    is flushed to the output (as a list) at the end.  Non-table objects are
    ignored.  Per-object failures are logged and skipped.
    """
    merged = []
    pending = []

    def starts_group(row):
        # "Header" row: a Chinese character in every cell past the first.
        return all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in row[1:])

    for obj in word_info:
        try:
            if obj['type'] != 'table':
                continue
            # Flat copy of every table, independent of grouping.
            merged.append(obj)
            data = obj['data']
            if not data:
                continue
            if starts_group(data[0]):
                if pending:
                    # Previous group is complete; emit it and start fresh.
                    merged.append(pending)
                    pending = []
                pending.append(obj)
            elif pending:
                # Continuation table: fold its rows onto the previous one.
                combined = list(pending[-1]['data'])
                combined.extend(list(obj['data']))
                obj['data'] = combined
                pending.clear()
                pending.append(obj)
        except Exception as e:
            applog.error(f"解析数据错误: {e}")

    if pending:
        merged.append(pending)

    return merged
|
||||
|
||||
def merge_consecutive_arrays_v1(pdf_info):
|
||||
|
@ -775,11 +814,10 @@ def start_table_measure_job(file_id):
|
|||
records_range_parts = utils.get_range(len(word_tables),MEASURE_COUNT)
|
||||
processes = []
|
||||
for record_range in records_range_parts:
|
||||
# get_table_measure(file_id,word_tables,record_range,)
|
||||
p = Process(target=get_table_measure, args=(file_id,word_tables,record_range,))
|
||||
processes.append(p)
|
||||
p.start()
|
||||
|
||||
|
||||
for p in processes:
|
||||
p.join()
|
||||
|
||||
|
|
|
@ -252,8 +252,8 @@ def append_to_file(file_path, text):
|
|||
|
||||
if __name__ == "__main__":
|
||||
current_directory = os.getcwd()
|
||||
docx_relative_path = 'file/docx/101.docx'
|
||||
file_relative_path = 'file/docx/test1.txt'
|
||||
docx_relative_path = '..\\file\\docx\\101.docx'
|
||||
file_relative_path = '..\\file\\docx\\test1.txt'
|
||||
docx_path = os.path.join(current_directory, docx_relative_path)
|
||||
file_path = os.path.join(current_directory, file_relative_path)
|
||||
try:
|
||||
|
|
|
@ -1,22 +0,0 @@
|
|||
"","","适用(如)","",""
|
||||
"非流动性资产处置损益,包括已计提资产减值准备的冲销部分","-236316.65","","232448.97","-46760.24"
|
||||
"计入当期损益的政府补助,但与公司正常经营业务密切相关、符合国家政策规定、按照确定的标准享有、对公司损益产生持续影响的政府补助除外","4471155.00","","9188174.79","13052067.83"
|
||||
"除同公司正常经营业务相关的有效套期保值业务外,非金融企业持有金融资产和金融负债产生的公允价值变动损益以及处置金融资产和金融负债产生的损益","13099776.76","","14132376.82","7256455.55"
|
||||
"计入当期损益的对非金融企业收取的资金占用费","","","",""
|
||||
"委托他人投资或管理资产的损益","","","",""
|
||||
"对外委托贷款取得的损益","","","",""
|
||||
"因不可抗力因素,如遭受自然灾害而产生的各项资产损失","-3826330.90","","",""
|
||||
"单独进行减值测试的应收款项减值准备转回","","","",""
|
||||
"企业取得子公司、联营企业及合营企业的投资成本小于取得投资时应享有被投资单位可辨认净资产公允价值产生的收益","","","",""
|
||||
"同一控制下企业合并产生的子公司期初至合并日的当期净损益","","","",""
|
||||
"非货币性资产交换损益","","","",""
|
||||
"债务重组损益","","","",""
|
||||
"企业因相关经营活动不再持续而发生的一次性费用,如安置职工的支出等","","","",""
|
||||
"因税收、会计等法律、法规的调整对当期损益产生的一次性影响","","","",""
|
||||
"因取消、修改股权激励计划一次性确认的股份支付费用","","","",""
|
||||
"对于现金结算的股份支付,在可行权日之后,应付职工薪酬的公允价值变动产生的损益","","","",""
|
||||
"采用公允价值模式进行后续计量的投资性房地产公允价值变动产生的损益","","","",""
|
||||
"交易价格显失公允的交易产生的收益","","","",""
|
||||
"与公司正常经营业务无关的或有事项产生的损益","","","",""
|
||||
"受托经营取得的托管费收入","","","",""
|
||||
"除上述各项之外的其他营业外收","-11648682.96","","-529596.32","34351.19"
|
|
|
@ -1,5 +0,0 @@
|
|||
"入和支出","","","",""
|
||||
"其他符合非经常性损益定义的损益项目","","","-757389.60","-729432.00"
|
||||
"减:所得税影响额","278940.19","","3339902.20","2935002.34"
|
||||
"少数股东权益影响额(税后)","","","",""
|
||||
"合计","1580661.06","","18926112.46","16631679.99"
|
|
|
@ -1,3 +0,0 @@
|
|||
"项目名称","期初余额","期末余额","当期变动","对当期利润的影响金额"
|
||||
"交易性金融资产","390568609.77","175421746.58","-215146863.19","-146863.19"
|
||||
"合计","390568609.77","175421746.58","-215146863.19","-146863.19"
|
|
|
@ -1 +0,0 @@
|
|||
"","","","","","","","","","",""
|
|
|
@ -1,5 +0,0 @@
|
|||
"序号","评价维度","指标","公司产品注册标准","2020版中国药典标准","欧洲药典9.0版标准"
|
||||
"123","杂质含量","卵清蛋白含量","≤60ng/mL","≤200ng/mL","≤500ng/mL"
|
||||
"","","蛋白质含量","≤360μg/mL","≤400μg/mL","≤600μg/mL"
|
||||
"","","游离甲醛含量","≤25μg/mL","≤50μg/mL","≤200μg/mL"
|
||||
"4","有效成分纯度","蛋白质含量/血凝素含量","≤3.0","≤4.5","≤6.0"
|
|
|
@ -1,8 +0,0 @@
|
|||
"","本年新增","本年新增","累计数量","累计数量"
|
||||
"","申请数(个)","获得数(个)","申请数(个)","获得数(个)"
|
||||
"发明专利","6","3","16","6"
|
||||
"实用新型专利","2","","12","10"
|
||||
"外观设计专利","","","",""
|
||||
"软件著作权","","","",""
|
||||
"其他","","","",""
|
||||
"合计","8","3","28","16"
|
|
|
@ -1,6 +0,0 @@
|
|||
"","本年度","上年度","变化幅度(%)"
|
||||
"费用化研发投入","15471820.82","32409476.90","-52.26"
|
||||
"资本化研发投入","15990870.05","13732758.96","16.44"
|
||||
"研发投入合计","31462690.87","46142235.86","-31.81"
|
||||
"研发投入总额占营业收入比例(%)","23.38","14.49","增加8.89个百分点"
|
||||
"研发投入资本化的比重(%)","50.82","29.76","增加21.06个百分点"
|
|
|
@ -1,12 +0,0 @@
|
|||
"","","资规模","金额","金额","阶段性成果","到目标","水平","应用前景"
|
||||
"1","冻干人用狂犬病疫苗(Vero细胞)","10000.00","1599.09","11578.76","注册申请中","获得生产批件","国内领先","用于预防狂犬病"
|
||||
"2","四价流感病毒裂解疫苗(儿童)","33000.00","410.69","1481.50","III期临床试验前期准备中","获得生产批件","国内领先","用于预防流行性感冒"
|
||||
"3","23价肺炎球菌多糖疫苗/13价肺炎球菌多糖结合疫苗","22980.00","123.49","631.25","临床前研究","获得生产批件","国内领先","用于预防肺炎"
|
||||
"4","冻干水痘减毒活疫苗","31975.00","225.03","946.69","临床前研究","获得生产批件","国内领先","用于预防水痘"
|
||||
"5","四价流感病毒裂解疫苗(高剂量)","11745.00","110.64","1961.90","临床前研究","获得生产批件","国内领先","用于预防流行性感冒"
|
||||
"6","重组带状疱疹疫苗","31975.00","168.99","429.68","临床前研究","获得生产批件","国内领先","用于预防带状疱疹"
|
||||
"7","冻干人用狂犬病疫苗(MRC-5细胞)","27915.00","33.77","200.46","临床前研究","获得生产批件","国内领先","用于预防狂犬病"
|
||||
"8","多价手足口病疫苗","29910.00","33.77","199.29","临床前研究","获得生产批件","国内领先","用于预防手足口病"
|
||||
"9","注射用重组人IL12/15-PDL1单纯疱疹I型溶瘤病毒注射液","38910.00","33.49","350.71","临床前研究","获得生产批件","新药","实体瘤治疗"
|
||||
"10","在中国3至8岁儿童中四价流感病毒裂解疫苗2针次免疫程序的探索研究","300.00","54.38","225.80","临床研究完成","获得注册批件","国内领先","预防流行性感冒"
|
||||
"合计","/","238710.00","2793.34","18006.04","/","/","/","/"
|
|
|
@ -1,6 +0,0 @@
|
|||
"基本情况","基本情况","基本情况"
|
||||
"","本期数","上期数"
|
||||
"公司研发人员的数量(人)","60","58"
|
||||
"研发人员数量占公司总人数的比例(%)","13.10","12.24"
|
||||
"研发人员薪酬合计","1012.67","932.12"
|
||||
"研发人员平均薪酬","16.88","16.07"
|
|
|
@ -1,14 +0,0 @@
|
|||
"研发人员学历结构","研发人员学历结构"
|
||||
"学历结构类别","学历结构人数"
|
||||
"博士研究生","3"
|
||||
"硕士研究生","6"
|
||||
"本科","40"
|
||||
"专科","10"
|
||||
"高中及以下","1"
|
||||
"研发人员年龄结构","研发人员年龄结构"
|
||||
"年龄结构类别","年龄结构人数"
|
||||
"30岁以下(不含30岁)","29"
|
||||
"30-40岁(含30岁,不含40岁)","20"
|
||||
"40-50岁(含40岁,不含50岁)","5"
|
||||
"50-60岁(含50岁,不含60岁)","4"
|
||||
"60岁及以上","2"
|
|
|
@ -1,10 +0,0 @@
|
|||
"科目","本期数","上年同期数","变动比例(%)"
|
||||
"营业收入","134591377.00","318486074.97","-57.74"
|
||||
"营业成本","29864436.32","50588057.11","-40.97"
|
||||
"销售费用","77073744.58","107494355.33","-28.30"
|
||||
"管理费用","58638054.44","60622550.89","-3.27"
|
||||
"财务费用","42981.30","-355527.32","不适用"
|
||||
"研发费用","15471820.82","32409476.90","-52.26"
|
||||
"经营活动产生的现金流量净额","80904692.08","38595320.99","109.62"
|
||||
"投资活动产生的现金流量净额","-187707765.08","112695639.52","-266.56"
|
||||
"筹资活动产生的现金流量净额","2517734.96","-13250290.31","不适用"
|
|
|
@ -1,2 +0,0 @@
|
|||
"主营业务分行业情况"
|
||||
"营业收入营业成本毛利率毛利率分行业营业收入营业成本比上年增比上年增比上年(%)减(%)减(%)增减"
|
|
|
@ -1,11 +0,0 @@
|
|||
"(%)","(%)","(%)","(%)","(%)","(%)","(%)"
|
||||
"减少生物制药134591377.0029864436.3277.81-57.74-40.976.31个百分点","减少生物制药134591377.0029864436.3277.81-57.74-40.976.31个百分点","减少生物制药134591377.0029864436.3277.81-57.74-40.976.31个百分点","减少生物制药134591377.0029864436.3277.81-57.74-40.976.31个百分点","减少生物制药134591377.0029864436.3277.81-57.74-40.976.31个百分点","减少生物制药134591377.0029864436.3277.81-57.74-40.976.31个百分点","减少生物制药134591377.0029864436.3277.81-57.74-40.976.31个百分点"
|
||||
"主营业务分产品情况","主营业务分产品情况","主营业务分产品情况","主营业务分产品情况","主营业务分产品情况","主营业务分产品情况","主营业务分产品情况"
|
||||
"分产品","营业收入","营业成本","毛利率(%)","营业收入比上年增减(%)","营业成本比上年增减(%)","毛利率比上年增减(%)"
|
||||
"四价流感病毒裂解疫苗","134591377.00","29864436.32","77.81","-57.74","-40.97","减少6.31个百分点"
|
||||
"主营业务分地区情况","主营业务分地区情况","主营业务分地区情况","主营业务分地区情况","主营业务分地区情况","主营业务分地区情况","主营业务分地区情况"
|
||||
"分地区","营业收入","营业成本","毛利率(%)","营业收入比上年增减(%)","营业成本比上年增减(%)","毛利率比上年增减(%)"
|
||||
"国内","134591377.00","29864436.32","77.81","-57.74","-40.97","减少6.31个百分点"
|
||||
"主营业务分销售模式情况","主营业务分销售模式情况","主营业务分销售模式情况","主营业务分销售模式情况","主营业务分销售模式情况","主营业务分销售模式情况","主营业务分销售模式情况"
|
||||
"销售模式","营业收入","营业成本","毛利率(%)","营业收入比上年增减(%)","营业成本比上年增减(%)","毛利率比上年增减(%)"
|
||||
"直销","134591377.00","29864436.32","77.81","-57.74","-40.97","减少6.31个百分点"
|
|
|
@ -1,2 +0,0 @@
|
|||
"主要产品","单位","生产量","销售量","库存量","生产量比上年增减(%)","销售量比上年增减(%)","库存量比上年增减(%)"
|
||||
"四价流感病毒裂解疫苗","瓶","2945705","1381358","1152015","-53.20","-51.74","-63.93"
|
|
|
@ -1,11 +0,0 @@
|
|||
"分行业","成本构成项目","本期金额","本期占总成本比例(%)","上年同期金额","上年同期占总成本比例(%)","本期金额较上年同期变动比例(%)","情况说明"
|
||||
"生物制药","直接材料","11118814.64","37.23","12840750.18","25.38","-13.41",""
|
||||
"","直接人工","1506181.29","5.04","2408448.11","4.76","-37.46","销量减少所致"
|
||||
"","制造费用","9877150.51","33.07","16580810.13","32.78","-40.43",""
|
||||
"","运输费用","7362289.88","24.66","18758048.69","37.08","-60.75",""
|
||||
"分产品情况","分产品情况","分产品情况","分产品情况","分产品情况","分产品情况","分产品情况","分产品情况"
|
||||
"分产品","成本构成项目","本期金额","本期占总成本比例(%)","上年同期金额","上年同期占总成本比例(%)","本期金额较上年同期变动比例(%)","情况说明"
|
||||
"四价流感病毒裂解疫苗","直接材料","11118814.64","37.23","12840750.18","25.38","-13.41",""
|
||||
"","直接人工","1506181.29","5.04","2408448.11","4.76","-37.46","销量减少所致"
|
||||
"","制造费用","9877150.51","33.07","16580810.13","32.78","-40.43",""
|
||||
"","运输费用","7362289.88","24.66","18758048.69","37.08","-60.75",""
|
|
|
@ -1,5 +0,0 @@
|
|||
"2","客户二","509.71","3.79","否"
|
||||
"3","客户三","318.08","2.36","否"
|
||||
"4","客户四","309.50","2.30","否"
|
||||
"5","客户五","256.49","1.91","否"
|
||||
"合计","/","2214.00","16.45","/"
|
|
|
@ -1,7 +0,0 @@
|
|||
"序号","供应商名称","采购额","占年度采购总额比例(%)","是否与上市公司存在关联关系"
|
||||
"1","供应商一","1599.68","15.59","否"
|
||||
"2","供应商二","1084.77","10.57","否"
|
||||
"3","供应商三","941.52","9.18","否"
|
||||
"4","供应商四","885.84","8.63","否"
|
||||
"5","供应商五","849.64","8.28","否"
|
||||
"合计","/","5361.45","52.25","/"
|
|
|
@ -1,5 +0,0 @@
|
|||
"科目","本期数","上年同期数","变动比例(%)"
|
||||
"销售费用","77073744.58","107494355.33","-28.30"
|
||||
"管理费用","58638054.44","60622550.89","-3.27"
|
||||
"财务费用","42981.30","-355527.32","不适用"
|
||||
"研发费用","15471820.82","32409476.90","-52.26"
|
|
|
@ -1,3 +0,0 @@
|
|||
"科目","本期数","上年同期数","变动比例(%)"
|
||||
"经营活动产生的现金流量净额","80904692.08","38595320.99","109.62"
|
||||
"投资活动产生的现金流量净额","-187707765.08","112695639.52","-266.56"
|
|
|
@ -1,13 +0,0 @@
|
|||
"项目名称","本期期末数","本期期末数占总资产的比例(%)","上期期末数","上期期末数占总资产的比例(%)","本期期末金额较上期期末变动比例(%)","情况说明"
|
||||
"货币资金","70443588.78","4.32","174728926.82","9.56","","-59.68说明1"
|
||||
"交易性金融资产","175421746.58","10.75","390568609.77","21.38","","-55.09说明2"
|
||||
"预付款项","2825253.64","0.17","5735966.10","0.31","","-50.74说明3"
|
||||
"其他应收款","479099.87","0.03","542645.12","0.03","-11.71",""
|
||||
"在建工程","649464436.15","39.81","619862948.00","33.93","4.78",""
|
||||
"长期待摊费用","248564.85","0.02","1626952.89","0.09","","-84.72说明4"
|
||||
"递延所得税资产","33313943.01","2.04","17752280.68","0.97","","87.66说明5"
|
||||
"其他非流动资产","3358975.00","0.21","3888619.41","0.21","-13.62",""
|
||||
"短期借款","64057597.23","3.93","42041861.11","2.30","","52.37说明6"
|
||||
"应付账款","86670216.00","5.31","98922415.32","5.42","-12.39",""
|
||||
"合同负债","0.00","0.00","50000.00","0.00","","-100.00说明7"
|
||||
"应交税费","1046668.08","0.06","1168680.25","0.06","-10.44",""
|
|
|
@ -1,2 +0,0 @@
|
|||
"细分行业","主要治疗领域","药(产)品名称","注册分类","适应症或功能主治","是否处方药","是否属于中药保护品种(如涉及)","发明专利起止期限(如适用)","是否属于报告期内推出的新药(产)品","是否纳入国家基药目录","是否纳入国家医保目录","是否纳入省级医保目录"
|
||||
"生物制药","预防流行性感冒","四价流感病毒裂解疫苗","预防用生物制品","预防流行性感冒","否","否","2020-05-05至2037-08-23","否","否","否","否"
|
|
|
@ -1,7 +0,0 @@
|
|||
"研发项目(含一致性评价项目)","药(产)品名称","注册分类","适应症或功能主治","是否处方药","是否属于中药保护品种(如涉及)","研发(注册)所处阶段"
|
||||
"冻干人用狂犬病疫苗(Vero细胞)","冻干人用狂犬病疫苗(Vero细胞)","预防用生物制品3.3类","预防狂犬病","否","否","申报注册"
|
||||
"四价流感病毒裂解疫苗(儿童)","四价流感病毒裂解疫苗(儿童)","预防用生物制品3.3类","预防流行性感冒","否","否","临床试验"
|
||||
"23价肺炎球菌多糖疫苗/13价肺炎球菌多糖结合疫苗","23价肺炎球菌多糖疫苗/13价肺炎球菌多糖结合疫苗","预防用生物制品3.3类","预防肺炎","否","否","临床前研究"
|
||||
"冻干水痘减毒活疫苗","冻干水痘减毒活疫苗","预防用生物制品3.3类","预防水痘","否","否","临床前研究"
|
||||
"四价流感病毒裂解疫苗(高剂量)","四价流感病毒裂解疫苗(高剂量)","预防用生物制品3.2类","预防流行性感冒","否","否","临床前研究"
|
||||
"重组带状疱疹疫苗","重组带状疱疹疫苗","预防用生物制品3.3类","预防带状疱疹","否","否","临床前研究"
|
|
|
@ -1,3 +0,0 @@
|
|||
"冻干人用狂犬病疫苗(MRC-5细胞)","冻干人用狂犬病疫苗(MRC-5细胞)","预防用生物制品3.3类","预防狂犬病","否","否","临床前研究"
|
||||
"多价手足口病疫苗","多价手足口病疫苗","预防用生物制品1.4类","预防多价手足口病","否","否","临床前研究"
|
||||
"注射用重组人IL12/15-PDL1单纯疱疹I型溶瘤病毒注射液","注射用重组人IL12/15-PDL1单纯疱疹I型溶瘤病毒注射液","治疗用生物制品1类","实体瘤治疗","否","否","临床前研究"
|
|
|
@ -1,10 +0,0 @@
|
|||
"同行业可比公司","研发投入金额","研发投入占营业收入比例(%)","研发投入占净资产比例(%)","研发投入资本化比重(%)"
|
||||
"长春百克生物科技股份有限公司","19874.22","10.89","4.94","5.74"
|
||||
"云南沃森生物技术股份有限公司","91061.04","22.14","9.74","14.74"
|
||||
"华兰生物疫苗股份有限公司","9321.33","3.87","1.49","2.26"
|
||||
"康希诺生物股份公司","66167.10","185.3","12.51","3.58"
|
||||
"北京万泰生物药业股份有限公司","129251.30","23.45","10.03","7.32"
|
||||
"同行业平均研发投入金额","同行业平均研发投入金额","53136.88","53136.88","53136.88"
|
||||
"公司报告期内研发投入占营业收入比例(%)","公司报告期内研发投入占营业收入比例(%)","23.38","23.38","23.38"
|
||||
"公司报告期内研发投入占净资产比例(%)","公司报告期内研发投入占净资产比例(%)","2.29","2.29","2.29"
|
||||
"公司报告期内研发投入资本化比重(%)","公司报告期内研发投入资本化比重(%)","50.82","50.82","50.82"
|
|
|
@ -1,11 +0,0 @@
|
|||
"研发项目","研发投入金额","研发投入费用化金额","研发投入资本化金额","研发投入占营业收入比例(%)","本期金额较上年同期变动比例(%)","情况说明"
|
||||
"冻干人用狂犬病疫苗(Vero细胞)","1599.09","","1599.09","11.88","16.44",""
|
||||
"四价流感病毒裂解疫苗(儿童)","410.69","410.69","","3.05","349.70","本报告期该项目已完成Ⅰ期临床试验,正在进行III期临床试验前期准备工作,研发投入同比增加。"
|
||||
"23价肺炎球菌多糖疫苗/13价肺炎球菌多糖结合疫苗","123.49","123.49","","0.92","20.09",""
|
||||
"冻干水痘减毒活疫苗","225.03","225.03","","1.67","-18.47",""
|
||||
"四价流感病毒裂解疫苗(高剂量)","110.64","110.64","","0.82","-92.85","本报告期该项目处于临床前研究阶段,研发投入同比减少。"
|
||||
"重组带状疱疹疫苗","168.99","168.99","","1.26","80.87","本报告期该项目处于临床前研究阶段,技术服务费研发投入同比增加。"
|
||||
"冻干人用狂犬病疫苗(MRC-5细胞)","33.77","33.77","","0.25","-55.03","本报告期该项目处于临床前研究阶段,研发投入同比增加。"
|
||||
"多价手足口病疫苗","33.77","33.77","","0.25","-63.65","本报告期该项目处于临床前研究阶段,研发投入同比增加。"
|
||||
"注射用重组人IL12/15-PDL1单纯疱疹I型溶瘤病毒注射液","33.49","33.49","","0.25","-57.64","本报告期该项目处于临床前研究阶段,研发投入同比增加。"
|
||||
"在中国3至8岁儿童中四价流感病毒裂解疫苗2针次免疫程序的探索研究","54.38","54.38","","0.40","-24.28","本报告期该项目临床研究完成,研发投入同比减少。"
|
|
|
@ -1,9 +0,0 @@
|
|||
"具体项目名称","本期发生额","本期发生额占销售费用总额比例(%)"
|
||||
"薪酬及社保费用","862.50","11.19"
|
||||
"差旅费","66.75","0.87"
|
||||
"业务招待费","35.21","0.46"
|
||||
"销售服务费","6469.41","83.93"
|
||||
"办公费","6.33","0.08"
|
||||
"会议费","212.51","2.76"
|
||||
"其他","54.66","0.71"
|
||||
"合计","7707.37","100.00"
|
|
|
@ -1,8 +0,0 @@
|
|||
"同行业可比公司","销售费用","销售费用占营业收入比例(%)"
|
||||
"长春百克生物科技股份有限公司","64716.89","35.47"
|
||||
"云南沃森生物技术股份有限公司","151957.55","36.94"
|
||||
"华兰生物疫苗股份有限公司","94899.25","39.37"
|
||||
"康希诺生物股份公司","35339.54","98.97"
|
||||
"北京万泰生物药业股份有限公司","159509.44","28.94"
|
||||
"公司报告期内销售费用总额","公司报告期内销售费用总额","7707.37"
|
||||
"公司报告期内销售费用占营业收入比例(%)","公司报告期内销售费用占营业收入比例(%)","57.26"
|
|
|
@ -1,3 +0,0 @@
|
|||
"资产类别","期初数","本期公允价值变动损益","计入权益的累计公允价值变动","本期计提的减值","本期购买金额","本期出售/赎回金额","其他变动","期末数"
|
||||
"其他","390568609.77","-146863.19","","","","215000000.00","","175421746.58"
|
||||
"合计","390568609.77","-146863.19","","","","215000000.00","","175421746.58"
|
|
|
@ -1,3 +0,0 @@
|
|||
"备查文件目录","载有公司负责人、主管会计工作负责人、会计机构负责人(会计主管人员)签名并盖章的财务报表"
|
||||
"","载有会计师事务所盖章、注册会计师签名并盖章的审计报告原件"
|
||||
"","报告期内公开披露过的所有公司文件的正本及公告的原稿。"
|
|
|
@ -1,2 +0,0 @@
|
|||
"会议届次","召开日期","决议刊登的指定网站的查询索引","决议刊登的披露日期","会议决议"
|
||||
"2022年年度股东大会","2023年5月10日","www.sse.com.cn","2023年5月11日","议案全部审议通过"
|
|
|
@ -1,10 +0,0 @@
|
|||
"姓名","职务","性别","年龄","任期起始日期","任期终止日期","","年初持股数年末持股数","年度内股份增减变动量","增减变动原因","报告期内从公司获得的税前报酬总额(万元)","是否在公司关联方获取报酬"
|
||||
"余军","董事长、核心技术人员","男","55","2020-06-15","2026-05-10","27049291","37869007","10819716","资本公积金转增股本","128.87否",""
|
||||
"张良斌","董事","男","49","2020-06-15","2026-05-10","27049291","37869008","10819717","资本公积金转增股本","","0是"
|
||||
"聂申钱","董事","男","76","2020-06-15","2026-05-10","3381159","4733623","1352464","资本公积金转增股本","","0是"
|
||||
"夏建国","董事、副总经理","男","51","2020-06-15","2026-05-10","2086865","2921611","","834746不适用","88.87否",""
|
||||
"邵蓉","独立董事女","","62","2020-06-15","2026-05-10","0","0","","0不适用","","12否"
|
||||
"管建强","独立董事男","","66","2020-06-15","2026-05-10","0","0","","0不适用","","12否"
|
||||
"程华(辞职)","独立董事女","","45","2020-06-15","2024-01-10","0","0","","0不适用","","12否"
|
||||
"魏大昌","监事会主席","男","56","2020-06-15","2026-05-10","0","0","","0不适用","62.78否",""
|
||||
"余晖晟","职工监事男","","28","2020-06-15","2026-05-10","0","0","","0不适用","8.79否",""
|
|
|
@ -1,11 +0,0 @@
|
|||
"黄玲","监事","女","58","2020-06-15","2026-05-10","0","0","","0不适用","","12否"
|
||||
"张建辉","总经理","男","66","2023-10-27","2026-05-10","4057394","5680352","1622958","资本公积金转增股本","30.50是",""
|
||||
"任晚琼(离职)","副总经理女","","54","2020-06-15","2023-10-27","0","0","","0不适用","88.86否",""
|
||||
"樊长勇","副总经理男","","45","2020-06-15","2026-05-10","0","0","","0不适用","56.77否",""
|
||||
"田国雄","副总经理男","","45","2022-05-30","2026-05-10","0","0","","0不适用","118.03否",""
|
||||
"滕红刚(离职)","副总经理男","","52","2022-05-30","2023-05-10","0","0","","0不适用","29.31否",""
|
||||
"黄强","财务总监男","","44","2022-04-08","2026-05-10","0","0","","0不适用","63.17否",""
|
||||
"李志刚(离职)","副总经理男","","42","2023-10-27","2023-12-22","0","0","","0不适用","41.93否",""
|
||||
"赵巍(离职)","副总经理男","","45","2023-10-27","2023-12-22","0","0","","0不适用","18.29否",""
|
||||
"吴建华","核心技术人员","男","49","2009-10-01","-","0","0","","0不适用","40.91否",""
|
||||
"合计","/","/","/","/","/","","","","/","825.08","/"
|
|
|
@ -1,3 +0,0 @@
|
|||
"姓名","主要工作经历"
|
||||
"余军","1992年8月至1993年10月任临川中心血站技术员;1993年11月至2000年12月任博雅生物制药股份有限公司生产经理;2001年1月至2002年5月任北京耀华生物技术有限公司总工程师;2002年6月至2005年7月任广东佰易药业有限公司副总经理;2005年10月至2014年3月任同路生物制药有限公司副总经理;2014年6月至2015年5月任海南中和药业有限公司副总经理;2015年6月至2023年10月担任公司核心技术人员、董事长、总经理,2023年10月至今担任公司核心技术人员、董事长。"
|
||||
"张良斌","1999年2月至2000年5月任博雅生物制药股份有限公司出纳;2000年5月至2001年10月任广东康之选医药连锁有限公司配送中心经理;2001年10月至2005年12月任广东佰易药业有限公司销售部经理;2006年1月至今任同路生物制药有限公司副总经理;2017"
|
|
|
@ -1,12 +0,0 @@
|
|||
"","年2月至今任浙江海康生物制品有限责任公司董事;2016年8月至今担任广东上量投资有限公司监事;2015年6月至今任公司董事。"
|
||||
"聂申钱","1969年2月至1987年11月任中国人民解放军海军航空兵部队干部;1987年12月至1993年8月任中国预防医学科学院中预公司经理;1993年9月至2011年9月任中信医药实业有限公司总经理;2011年10月至2013年12月任上药科园信海医药有限公司党委书记;2014年7月至2016年11月任海南中和药业有限公司董事兼总经理;2016年11月至今任海南中和药业股份有限公司董事、高级顾问;2020年4月至今担任海南妙峰山健康产业有限公司执行董事兼总经理;2015年6月至今任公司董事。"
|
||||
"夏建国","1995年8月至1998年8月任南京药械厂制药机械研究所设计师;1998年8月至2000年12月任博雅生物制药股份有限公司冻干技师;2001年1月至2002年8月任深圳海普瑞生物技术有限公司工程部主管;2002年9月至2005年12月任广东佰易药业有限公司工程部经理;2006年1月至2015年5月任同路生物制药有限公司项目总监;2015年6月至今担任公司董事、副总经理。"
|
||||
"邵蓉","2020年6月至今担任公司独立董事。现就职于中国药科大学,任国家药物政策与医药产业经济研究中心执行副主任,教授、博士生导师,兼任天境生物(I-Mab)独立董事、江苏当代国安律师事务所执业律师、中国药学会理事、中国药品监督管理研究会政策与法规专业委员会主任委员、中国药促会监事等职。"
|
||||
"管建强","2020年6月至今担任公司独立董事。现担任华东政法大学教授和博士生导师,兼任江苏图南合金股份有限公司独立董事。"
|
||||
"程华(辞职)","2020年6月至2023年12月,担任公司独立董事。现担任财政部会计准则委员会高级会计师,兼任中国财政科学研究院硕士生导师、湘财股份有限公司独立董事、悦康药业集团股份有限公司独立董事、山东步长制药股份有限公司独立董事等职。"
|
||||
"魏大昌","1988年11月至1993年9月,任成都军区后勤部供血站精制组长;1993年9月至1998年11月,任江西省博达生物工程研究所工程师;1998年11月至2005年3月,任广东湛江双林生物制药有限公司总经理助理兼生产部部长;2005年3月至2005年10月,任广东佰易药业有限公司生产部经理;2005年11月至2016年6月,任同路生物制药有限公司生产部经理;2016年6月至2018年5月,任中科生物制药有限公司血制项目总监;2018年6月至2019年6月,任通盈生物制药有限公司血制项目总监;2019年7月至今任公司包装部经理(总监),2020年6月至今任公司监事会主席。"
|
||||
"余晖晟","2017年2月至今为公司车间员工,2020年6月至今任公司职工代表监事。"
|
||||
"黄玲","1988年9月至1993年10月任北京东风制药厂技术员;1993年10月至1995年8月,任北京亚都生物公司技术员;1997年10月至2003年3月,任北京巨能公司研究员;2003年3月至今任北京秦脉医药咨询有限责任公司咨询师;2020年6月至今任公司监事。"
|
||||
"张建辉","1975年1月至1997年12月在江西省抚州地区煤炭公司任职;1998年1月至2007年1月任江西省崇仁县单采血浆站站长;2007年2月至2009年1月任博雅生物制药股份有限公司副总经理;2009年2月至2011年12月任同路生物制药有限公司血浆部副总经理;2012年1月至今任郴州市云鼎房地产有限公司董事长;2020年5月至今任福建省宏冠房地产开发有限公司董事长,2023年10月至今担任公司总经理。"
|
||||
"任晚琼(离职)","1993年8月至2010年7月任职于河南欣泰药业有限公司,历任质检科职员、质检科主任、质量保证部部长、副总经理;2010年8月至2015年2月任河南远大生物制药有限公司副总经理;2015年6月至2019年6月任公司质量总监;2019年6月至2023年10月任公司副总经理。"
|
||||
"樊长勇","2001年7月至2004年1月任上海九鼎粉体材料有限公司技术员;2004年1月至2007年7月任上海界龙实业股份有限公司高级经理;2007年7月至2009年8月任国信证券股份有限公司投资银行高级经理;2009年9月至2015年6月任中信证券股份有限公司投资银行"
|
|
|
@ -1,7 +0,0 @@
|
|||
"","委员会副总裁(VP)、高级副总裁(SVP)、保荐代表人;2016年4月至2020年4月任上海莱士血液制品股份有限公司董事长助理,2018年9月至2020年4月任同方莱士医药产业投资(广东)有限公司总经理;2020年5月至2024年1月任公司副总经理兼董事会秘书。"
|
||||
"田国雄","2002年7月至2005年12月先后任广东佰易药业有限公司地区商务经理、地区销售经理,2006年1月至2022年3月先后任同路生物制药有限公司地区销售经理、大区销售经理、大区销售总监。2022年5月起任公司副总经理。"
|
||||
"滕红刚(离职)","1995年9月至2000年7月在长春生物制品所病毒研究室工作,2003年7月在长春生物制品所获得免疫学硕士学位,2006年7月在吉林大学生命科学学院获得生物化学与分子生物学专业博士学位,2006年10月至2007年4月任中国科学院广州生物医药与健康研究院研究助理,2007年6月至2009年6月任吉林亚泰生物药业股份有限公司副总经理,2009年6月至2011年10月任鸿达生物药业长春股份有限公司副总经理,2011年12月至2015年5月任长春卫尔赛生物药业有限公司生产总监,2015年8月至2016年5月任霍普金斯医药研究院长春分院院长,2016年8月至2022年3月先后任辽宁依生生物制药有限公司副总经理、总经理。2022年5月至2023年5月任公司副总经理。"
|
||||
"黄强","2002年7月至2016年3月历任河南神火煤电股份有限公司(000933.SZ)财务部科员、副科长、科长;2016年4月至2021年3月历任海南中和药业股份有限公司证券事务代表、董事会办公室主任、财务副总监、总经理助理;2021年4月至2022年4月任江苏金迪克生物技术股份有限公司财务副总监。2022年4月起任公司财务总监。"
|
||||
"李志刚(离职)","2008年3月至2010年3月任牛津大学高级研究助理;2010年8月至2012年10月任北京必威安泰生物科技有限公司研发项目负责人;2012年10月至2018年3月任北京生物制品研究所有限公司经理、副主任等职;2018年4月至2019年7月任中国生物技术股份有限公司部长助理;2019年8月至2021年5月任北京民海生物科技有限公司质量合规总监;2021年5月至2022年4月任斯微(上海)生物科技有限公司副总裁;2022年5月至2023年10月任君拓生物医药科技(海南)有限公司副总裁。2023年10月至2023年12月担任公司副总经理。"
|
||||
"赵巍(离职)","2000年10月至2002年9月任武汉海特生物制药股份有限公司员工;2002年9月至2004年6月就读于华中科技大学同济医学院,获学士学位;2005年9月至2007年6月就读于武汉大学,获硕士学位;2007年7月至2023年3月历任武汉生物制品研究所有限责任公司流感疫苗课题组第二课题负责人、病毒性疫苗研究二室主任、流感病毒疫苗室主任。2023年3月至2023年10月任上海君拓生物医药科技有限公司总裁助理(兼无锡君和生物医药科技有限公司副总经理)。2023年10月至2023年12月担任公司副总经理。"
|
||||
"吴建华","1998年6月至2003年3月,任浙江天元生物药业股份有限公司生产技术员、研发助理工程师;2003年3月至2009年9月,任北京金迪克生物技术研究所研发主管;2009年10月至今任公司质量控制部经理。吴建华主要负责公司四价流感病毒裂解疫苗的临床前和临床试验研究、生产工艺研究和质量控制研究工作,以及公司冻干人用狂犬病疫苗(Vero细胞)、四价流感病毒裂解疫苗(儿童)、四价流感病毒裂解疫苗(高剂量)、冻干水痘减毒活疫苗、冻干带状疱疹减毒活疫苗等在研项目的质量控制研究工作。"
|
|
|
@ -1,4 +0,0 @@
|
|||
"任职人员姓名","股东单位名称","在股东单位担任的职务","","任期起始日期任期终止日期"
|
||||
"余军","泰州同泽","执行事务合伙人","2020年5月",""
|
||||
"张良斌","泰州同人","执行事务合伙人","2020年5月",""
|
||||
"在股东单位任职情况的说明","不适用","不适用","不适用","不适用"
|
|
|
@ -1,19 +0,0 @@
|
|||
"任职人员姓名","其他单位名称","在其他单位担任的职务","任期起始日期","任期终止日期"
|
||||
"张良斌","同路生物制药有限公司","副总经理","2006年1月",""
|
||||
"","浙江海康生物制品有限责任公司","董事","2017年2月",""
|
||||
"","广东上量投资有限公司","监事","2016年8月",""
|
||||
"聂申钱","海南中和药业股份有限公司","董事、高级顾问","2016年11月",""
|
||||
"","海南妙峰山健康产业有限公司","执行董事兼总经理","2020年4月",""
|
||||
"邵蓉","中国药科大学","教授、博士生导师","1983年8月",""
|
||||
"","天境生物(I-Mab)","独立董事","2021年6月",""
|
||||
"","江苏当代国安律师事务所","执业律师","2000年8月",""
|
||||
"管建强","华东政法大学","教授、博士生导师","1995年5月",""
|
||||
"","江苏图南合金股份有限公司","独立董事","2018年3月",""
|
||||
"孙红星","上海财经大学","副教授","2021年6月",""
|
||||
"","上海雅运纺织化工股份有限公司","独立董事","2023年7月",""
|
||||
"","苏州世名科技股份有限公司","独立董事","2022年9月",""
|
||||
"黄玲","北京秦脉医药咨询有限责任公司","咨询师","2003年3月",""
|
||||
"程华(离职)","湘财股份有限公司","独立董事","2020年8月",""
|
||||
"","悦康药业集团股份有限公司","独立董事","2019年5月",""
|
||||
"","山东步长制药股份有限公司","独立董事","2021年6月",""
|
||||
"在其他单位任职情况的说明","无","无","无","无"
|
|
|
@ -1,7 +0,0 @@
|
|||
"董事、监事、高级管理人员报酬的决策程序","根据公司章程规定,公司薪酬与考核委员会对董事、高级管理人员的薪酬政策和方案进行研究和审查,高级管理人员的薪酬方案由董事会批准后执行;董事、监事的薪酬方案由董事会、监事会批准后提交股东大会审议通过后执行。"
|
||||
"董事在董事会讨论本人薪酬事项时是否回避","是"
|
||||
"薪酬与考核委员会或独立董事专门会议关于董事、监事、高级管理人员报酬事项发表建议的具体情况","薪酬与考核委员会对董事、高管的薪酬方案和政策均无异议通过。监事会对监事薪酬全体回避,提交股东大会审议,股东大会审议通过。"
|
||||
"董事、监事、高级管理人员报酬确定依据","担任具体职务的董事、监事,根据其在公司的具体任职岗位领取相应薪酬,未在公司任职的非独立董事不在公司领取薪酬和津贴;独立董事、未在公司任职的监事享有固定数额的津贴,随公司工资发放;高级管理人员薪酬由基本薪酬、年终奖金两部分构成,其中基本薪酬系高级管理人员根据职务等级及职责每月领取的,年终奖金根据年度经营及考核情况发放。"
|
||||
"董事、监事和高级管理人员报酬的实际支付情况","本报告期内,公司董事、监事和高级管理人员报酬的实际支付与公司披露的情况一致"
|
||||
"报告期末全体董事、监事和高级管理人员实际获得的报酬合计","784.17"
|
||||
"报告期末核心技术人员实际获得的报酬合计","169.78"
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue