Add Word version code

吴登数 2024-12-30 17:51:12 +08:00
parent 15e33eadf7
commit dd5ee3722e
126 changed files with 27475 additions and 2 deletions


@@ -1,7 +1,5 @@
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, MilvusClient
from config import MILVUS_CLIENT
import time
from datetime import datetime, timedelta
def create_partition_by_hour(current_hour):
    # Connect to the Milvus server
@@ -28,6 +26,7 @@ def create_partition_by_hour(current_hour):
        pre_partition.release()
        collection.drop_partition(name)
        print(f"Partition '{name}' deleted.")
    connections.disconnect("default")
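
For context: the function in this hunk creates the partition for the current hour and drops expired ones, and the commit adds a disconnect at the end. A minimal sketch of that rotation pattern with pymilvus follows; the collection name, the p%Y%m%d%H naming scheme, and the retention window are assumptions for illustration, not taken from the diff.

from datetime import datetime, timedelta
from pymilvus import connections, Collection

def rotate_hourly_partitions(uri, collection_name="reports", keep_hours=24):
    # Hypothetical helper; collection name and "p%Y%m%d%H" scheme are assumed.
    connections.connect(alias="default", uri=uri)
    collection = Collection(collection_name)
    now = datetime.now()
    current = now.strftime("p%Y%m%d%H")
    if not collection.has_partition(current):
        collection.create_partition(current)  # partition for the current hour
    cutoff = now - timedelta(hours=keep_hours)
    for part in collection.partitions:
        try:
            ts = datetime.strptime(part.name, "p%Y%m%d%H")
        except ValueError:
            continue  # skip _default and anything outside the naming scheme
        if ts < cutoff:
            part.release()  # release before dropping, as the diff context shows
            collection.drop_partition(part.name)
    connections.disconnect("default")  # matches the line this commit adds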

zzb_data_word/.gitignore (vendored, new file, +3)

@@ -0,0 +1,3 @@
*.pyc
*.vscode
__pycache__/

zzb_data_word/.idea/.gitignore (vendored, new file, +8)

@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml


@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding">
<file url="file://$PROJECT_DIR$/log-day/sec.log" charset="GBK" />
</component>
</project>


@@ -0,0 +1,168 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="155">
<item index="0" class="java.lang.String" itemvalue="pandas" />
<item index="1" class="java.lang.String" itemvalue="protobuf" />
<item index="2" class="java.lang.String" itemvalue="decorator" />
<item index="3" class="java.lang.String" itemvalue="TA-Lib" />
<item index="4" class="java.lang.String" itemvalue="websocket-client" />
<item index="5" class="java.lang.String" itemvalue="altgraph" />
<item index="6" class="java.lang.String" itemvalue="tzlocal" />
<item index="7" class="java.lang.String" itemvalue="Babel" />
<item index="8" class="java.lang.String" itemvalue="testpath" />
<item index="9" class="java.lang.String" itemvalue="pickleshare" />
<item index="10" class="java.lang.String" itemvalue="psycopg2" />
<item index="11" class="java.lang.String" itemvalue="defusedxml" />
<item index="12" class="java.lang.String" itemvalue="lml" />
<item index="13" class="java.lang.String" itemvalue="PyQt5-sip" />
<item index="14" class="java.lang.String" itemvalue="javascripthon" />
<item index="15" class="java.lang.String" itemvalue="ipython-genutils" />
<item index="16" class="java.lang.String" itemvalue="tables" />
<item index="17" class="java.lang.String" itemvalue="rqdatac" />
<item index="18" class="java.lang.String" itemvalue="Pygments" />
<item index="19" class="java.lang.String" itemvalue="PyQt5" />
<item index="20" class="java.lang.String" itemvalue="bleach" />
<item index="21" class="java.lang.String" itemvalue="graphviz" />
<item index="22" class="java.lang.String" itemvalue="jsonschema" />
<item index="23" class="java.lang.String" itemvalue="pywin32" />
<item index="24" class="java.lang.String" itemvalue="qtconsole" />
<item index="25" class="java.lang.String" itemvalue="terminado" />
<item index="26" class="java.lang.String" itemvalue="portalocker" />
<item index="27" class="java.lang.String" itemvalue="Werkzeug" />
<item index="28" class="java.lang.String" itemvalue="aniso8601" />
<item index="29" class="java.lang.String" itemvalue="mxnet" />
<item index="30" class="java.lang.String" itemvalue="jupyter-client" />
<item index="31" class="java.lang.String" itemvalue="QDarkStyle" />
<item index="32" class="java.lang.String" itemvalue="ipykernel" />
<item index="33" class="java.lang.String" itemvalue="nbconvert" />
<item index="34" class="java.lang.String" itemvalue="attrs" />
<item index="35" class="java.lang.String" itemvalue="pefile" />
<item index="36" class="java.lang.String" itemvalue="psutil" />
<item index="37" class="java.lang.String" itemvalue="pyinstaller-hooks-contrib" />
<item index="38" class="java.lang.String" itemvalue="PyQtWebEngine" />
<item index="39" class="java.lang.String" itemvalue="simplejson" />
<item index="40" class="java.lang.String" itemvalue="prettytable" />
<item index="41" class="java.lang.String" itemvalue="jedi" />
<item index="42" class="java.lang.String" itemvalue="helpdev" />
<item index="43" class="java.lang.String" itemvalue="pyqtgraph" />
<item index="44" class="java.lang.String" itemvalue="dukpy" />
<item index="45" class="java.lang.String" itemvalue="futu-api" />
<item index="46" class="java.lang.String" itemvalue="matplotlib" />
<item index="47" class="java.lang.String" itemvalue="humanize" />
<item index="48" class="java.lang.String" itemvalue="PyMySQL" />
<item index="49" class="java.lang.String" itemvalue="msgpack" />
<item index="50" class="java.lang.String" itemvalue="idna" />
<item index="51" class="java.lang.String" itemvalue="rsa" />
<item index="52" class="java.lang.String" itemvalue="vnstation" />
<item index="53" class="java.lang.String" itemvalue="pandocfilters" />
<item index="54" class="java.lang.String" itemvalue="numpy" />
<item index="55" class="java.lang.String" itemvalue="pyasn1" />
<item index="56" class="java.lang.String" itemvalue="requests" />
<item index="57" class="java.lang.String" itemvalue="pyrsistent" />
<item index="58" class="java.lang.String" itemvalue="gluoncv" />
<item index="59" class="java.lang.String" itemvalue="jdcal" />
<item index="60" class="java.lang.String" itemvalue="jupyter" />
<item index="61" class="java.lang.String" itemvalue="seaborn" />
<item index="62" class="java.lang.String" itemvalue="zipp" />
<item index="63" class="java.lang.String" itemvalue="prompt-toolkit" />
<item index="64" class="java.lang.String" itemvalue="tigeropen" />
<item index="65" class="java.lang.String" itemvalue="itsdangerous" />
<item index="66" class="java.lang.String" itemvalue="pyee" />
<item index="67" class="java.lang.String" itemvalue="deap" />
<item index="68" class="java.lang.String" itemvalue="websockets" />
<item index="69" class="java.lang.String" itemvalue="ipywidgets" />
<item index="70" class="java.lang.String" itemvalue="scipy" />
<item index="71" class="java.lang.String" itemvalue="tornado" />
<item index="72" class="java.lang.String" itemvalue="pyppeteer" />
<item index="73" class="java.lang.String" itemvalue="Send2Trash" />
<item index="74" class="java.lang.String" itemvalue="et-xmlfile" />
<item index="75" class="java.lang.String" itemvalue="incremental" />
<item index="76" class="java.lang.String" itemvalue="mistune" />
<item index="77" class="java.lang.String" itemvalue="cnocr" />
<item index="78" class="java.lang.String" itemvalue="future" />
<item index="79" class="java.lang.String" itemvalue="mpmath" />
<item index="80" class="java.lang.String" itemvalue="jupyter-console" />
<item index="81" class="java.lang.String" itemvalue="macropy3" />
<item index="82" class="java.lang.String" itemvalue="pycryptodome" />
<item index="83" class="java.lang.String" itemvalue="pytz" />
<item index="84" class="java.lang.String" itemvalue="setproctitle" />
<item index="85" class="java.lang.String" itemvalue="webencodings" />
<item index="86" class="java.lang.String" itemvalue="Pillow" />
<item index="87" class="java.lang.String" itemvalue="Twisted" />
<item index="88" class="java.lang.String" itemvalue="traitlets" />
<item index="89" class="java.lang.String" itemvalue="Automat" />
<item index="90" class="java.lang.String" itemvalue="pywinpty" />
<item index="91" class="java.lang.String" itemvalue="python-dateutil" />
<item index="92" class="java.lang.String" itemvalue="Brotli" />
<item index="93" class="java.lang.String" itemvalue="Click" />
<item index="94" class="java.lang.String" itemvalue="cycler" />
<item index="95" class="java.lang.String" itemvalue="MarkupSafe" />
<item index="96" class="java.lang.String" itemvalue="twisted-iocpsupport" />
<item index="97" class="java.lang.String" itemvalue="constantly" />
<item index="98" class="java.lang.String" itemvalue="mongoengine" />
<item index="99" class="java.lang.String" itemvalue="appdirs" />
<item index="100" class="java.lang.String" itemvalue="docopt" />
<item index="101" class="java.lang.String" itemvalue="ibapi" />
<item index="102" class="java.lang.String" itemvalue="pymssql" />
<item index="103" class="java.lang.String" itemvalue="pyzmq" />
<item index="104" class="java.lang.String" itemvalue="certifi" />
<item index="105" class="java.lang.String" itemvalue="entrypoints" />
<item index="106" class="java.lang.String" itemvalue="peewee" />
<item index="107" class="java.lang.String" itemvalue="pyparsing" />
<item index="108" class="java.lang.String" itemvalue="sympy" />
<item index="109" class="java.lang.String" itemvalue="notebook" />
<item index="110" class="java.lang.String" itemvalue="hyperlink" />
<item index="111" class="java.lang.String" itemvalue="win-unicode-console" />
<item index="112" class="java.lang.String" itemvalue="kiwisolver" />
<item index="113" class="java.lang.String" itemvalue="zope.interface" />
<item index="114" class="java.lang.String" itemvalue="APScheduler" />
<item index="115" class="java.lang.String" itemvalue="backcall" />
<item index="116" class="java.lang.String" itemvalue="PySocks" />
<item index="117" class="java.lang.String" itemvalue="widgetsnbextension" />
<item index="118" class="java.lang.String" itemvalue="numexpr" />
<item index="119" class="java.lang.String" itemvalue="pyecharts-snapshot" />
<item index="120" class="java.lang.String" itemvalue="jupyter-core" />
<item index="121" class="java.lang.String" itemvalue="pyecharts-jupyter-installer" />
<item index="122" class="java.lang.String" itemvalue="Delorean" />
<item index="123" class="java.lang.String" itemvalue="SQLAlchemy" />
<item index="124" class="java.lang.String" itemvalue="wcwidth" />
<item index="125" class="java.lang.String" itemvalue="importlib-metadata" />
<item index="126" class="java.lang.String" itemvalue="Jinja2" />
<item index="127" class="java.lang.String" itemvalue="simplegeneric" />
<item index="128" class="java.lang.String" itemvalue="stomp.py" />
<item index="129" class="java.lang.String" itemvalue="pywin32-ctypes" />
<item index="130" class="java.lang.String" itemvalue="pyecharts" />
<item index="131" class="java.lang.String" itemvalue="urllib3" />
<item index="132" class="java.lang.String" itemvalue="Flask" />
<item index="133" class="java.lang.String" itemvalue="coverage" />
<item index="134" class="java.lang.String" itemvalue="pyinstaller" />
<item index="135" class="java.lang.String" itemvalue="pymongo" />
<item index="136" class="java.lang.String" itemvalue="six" />
<item index="137" class="java.lang.String" itemvalue="parso" />
<item index="138" class="java.lang.String" itemvalue="pytesseract" />
<item index="139" class="java.lang.String" itemvalue="nbformat" />
<item index="140" class="java.lang.String" itemvalue="ipython" />
<item index="141" class="java.lang.String" itemvalue="jqdatasdk" />
<item index="142" class="java.lang.String" itemvalue="python-rapidjson" />
<item index="143" class="java.lang.String" itemvalue="packaging" />
<item index="144" class="java.lang.String" itemvalue="pyecharts-javascripthon" />
<item index="145" class="java.lang.String" itemvalue="prometheus-client" />
<item index="146" class="java.lang.String" itemvalue="jupyter-echarts-pypkg" />
<item index="147" class="java.lang.String" itemvalue="chardet" />
<item index="148" class="java.lang.String" itemvalue="tqdm" />
<item index="149" class="java.lang.String" itemvalue="thriftpy2" />
<item index="150" class="java.lang.String" itemvalue="colorama" />
<item index="151" class="java.lang.String" itemvalue="vnpy" />
<item index="152" class="java.lang.String" itemvalue="ply" />
<item index="153" class="java.lang.String" itemvalue="Flask-RESTful" />
<item index="154" class="java.lang.String" itemvalue="openpyxl" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>


@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>


@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Python 3.11" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
</project>


@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/zzb_data.iml" filepath="$PROJECT_DIR$/.idea/zzb_data.iml" />
</modules>
</component>
</project>


@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>


@@ -0,0 +1,281 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="AutoImportSettings">
<option name="autoReloadType" value="SELECTIVE" />
</component>
<component name="ChangeListManager">
<list default="true" id="22ddc8e3-82b9-4724-8dc4-c1cf50311f22" name="Changes" comment="">
<change afterPath="$PROJECT_DIR$/000593.docx" afterDir="false" />
<change afterPath="$PROJECT_DIR$/000593.pdf" afterDir="false" />
<change afterPath="$PROJECT_DIR$/app_word.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/db_service_word.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/file/docx/通威股份有限公司2023年第三季度报告.docx" afterDir="false" />
<change afterPath="$PROJECT_DIR$/main_word.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/test.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/word_title.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/zzb_logger.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/通威股份有限公司2023年第三季度报告.docx" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.vscode/launch.json" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/app.py" beforeDir="false" afterPath="$PROJECT_DIR$/app.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/config.py" beforeDir="false" afterPath="$PROJECT_DIR$/config.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/config_init.py" beforeDir="false" afterPath="$PROJECT_DIR$/config_init.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/db_service.py" beforeDir="false" afterPath="$PROJECT_DIR$/db_service.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/file/docx/000593-2023-nb-nb.docx" beforeDir="false" afterPath="$PROJECT_DIR$/file/docx/000593-2023-nb-nb.docx" afterDir="false" />
<change beforePath="$PROJECT_DIR$/file/docx/test.txt" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/main.py" beforeDir="false" afterPath="$PROJECT_DIR$/main.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/parse_word/parse_word.py" beforeDir="false" afterPath="$PROJECT_DIR$/parse_word/parse_word.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/pdf_title.py" beforeDir="false" afterPath="$PROJECT_DIR$/pdf_title.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/redis_service.py" beforeDir="false" afterPath="$PROJECT_DIR$/redis_service.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/requirements.txt" beforeDir="false" afterPath="$PROJECT_DIR$/requirements.txt" afterDir="false" />
<change beforePath="$PROJECT_DIR$/test_process.py" beforeDir="false" afterPath="$PROJECT_DIR$/test_process.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/utils.py" afterDir="false" />
</list>
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES">
<list>
<option value="Python Script" />
</list>
</option>
</component>
<component name="FlaskConsoleOptions" custom-start-script="import sys&#10;sys.path.extend([WORKING_DIR_AND_PYTHON_PATHS])&#10;from flask.cli import ScriptInfo&#10;locals().update(ScriptInfo(create_app=None).load_app().make_shell_context())&#10;print(&quot;Python %s on %s\nApp: %s [%s]\nInstance: %s&quot; % (sys.version, sys.platform, app.import_name, app.env, app.instance_path))">
<envs>
<env key="FLASK_APP" value="app" />
</envs>
<option name="myCustomStartScript" value="import sys&#10;sys.path.extend([WORKING_DIR_AND_PYTHON_PATHS])&#10;from flask.cli import ScriptInfo&#10;locals().update(ScriptInfo(create_app=None).load_app().make_shell_context())&#10;print(&quot;Python %s on %s\nApp: %s [%s]\nInstance: %s&quot; % (sys.version, sys.platform, app.import_name, app.env, app.instance_path))" />
<option name="myEnvs">
<map>
<entry key="FLASK_APP" value="app" />
</map>
</option>
</component>
<component name="Git.Settings">
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
</component>
<component name="ProjectColorInfo">{
&quot;associatedIndex&quot;: 7
}</component>
<component name="ProjectId" id="2mTMc5iMC8X5mnsBHls6IKXwyDz" />
<component name="ProjectViewState">
<option name="hideEmptyMiddlePackages" value="true" />
<option name="showLibraryContents" value="true" />
</component>
<component name="PropertiesComponent">{
&quot;keyToString&quot;: {
&quot;Python.app.executor&quot;: &quot;Run&quot;,
&quot;Python.app_word.executor&quot;: &quot;Run&quot;,
&quot;Python.config_init.executor&quot;: &quot;Run&quot;,
&quot;Python.db_service.executor&quot;: &quot;Debug&quot;,
&quot;Python.db_service_word.executor&quot;: &quot;Debug&quot;,
&quot;Python.main_word.executor&quot;: &quot;Debug&quot;,
&quot;Python.parse_word.executor&quot;: &quot;Run&quot;,
&quot;Python.pdf_title.executor&quot;: &quot;Run&quot;,
&quot;Python.redis_service.executor&quot;: &quot;Run&quot;,
&quot;Python.test.executor&quot;: &quot;Run&quot;,
&quot;Python.test_process.executor&quot;: &quot;Run&quot;,
&quot;Python.zzb_logger.executor&quot;: &quot;Run&quot;,
&quot;RunOnceActivity.OpenProjectViewOnStart&quot;: &quot;true&quot;,
&quot;RunOnceActivity.ShowReadmeOnStart&quot;: &quot;true&quot;,
&quot;git-widget-placeholder&quot;: &quot;pdf-0914(测试环境)&quot;,
&quot;ignore.virus.scanning.warn.message&quot;: &quot;true&quot;,
&quot;last_opened_file_path&quot;: &quot;C:/Users/45272/OneDrive/Documents/work/code/zzb_data_word/parse_word&quot;,
&quot;node.js.detected.package.eslint&quot;: &quot;true&quot;,
&quot;node.js.detected.package.tslint&quot;: &quot;true&quot;,
&quot;node.js.selected.package.eslint&quot;: &quot;(autodetect)&quot;,
&quot;node.js.selected.package.tslint&quot;: &quot;(autodetect)&quot;,
&quot;nodejs_package_manager_path&quot;: &quot;npm&quot;,
&quot;settings.editor.selected.configurable&quot;: &quot;com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable&quot;,
&quot;vue.rearranger.settings.migration&quot;: &quot;true&quot;
}
}</component>
<component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS">
<recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data_word\parse_word" />
<recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data\parse_word" />
<recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data\file\docx" />
<recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data" />
</key>
</component>
<component name="RunManager" selected="Python.app_word">
<configuration name="app_word" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="zzb_data" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/app_word.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="main_word" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="zzb_data" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/main_word.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="parse_word" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="zzb_data" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/parse_word" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/parse_word/parse_word.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="test" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="zzb_data" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/test.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="zzb_logger" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="zzb_data" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/zzb_logger.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<recent_temporary>
<list>
<item itemvalue="Python.app_word" />
<item itemvalue="Python.zzb_logger" />
<item itemvalue="Python.test" />
<item itemvalue="Python.parse_word" />
<item itemvalue="Python.main_word" />
</list>
</recent_temporary>
</component>
<component name="SharedIndexes">
<attachedChunks>
<set>
<option value="bundled-python-sdk-5a2391486177-2887949eec09-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-233.13763.11" />
</set>
</attachedChunks>
</component>
<component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="22ddc8e3-82b9-4724-8dc4-c1cf50311f22" name="Changes" comment="" />
<created>1727096188853</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1727096188853</updated>
<workItem from="1727096189919" duration="4242000" />
<workItem from="1727226275743" duration="13996000" />
<workItem from="1727406650674" duration="9011000" />
<workItem from="1727574308954" duration="28121000" />
<workItem from="1728458648396" duration="3371000" />
<workItem from="1728462140132" duration="473000" />
<workItem from="1728462643998" duration="20841000" />
<workItem from="1728544515382" duration="4091000" />
<workItem from="1728557155319" duration="4071000" />
<workItem from="1728607689751" duration="6705000" />
<workItem from="1728868463278" duration="598000" />
<workItem from="1728953453192" duration="2839000" />
<workItem from="1728958252539" duration="12021000" />
<workItem from="1729042469650" duration="17683000" />
<workItem from="1729213219892" duration="15109000" />
</task>
<servers />
</component>
<component name="TypeScriptGeneratedFilesManager">
<option name="version" value="3" />
</component>
<component name="com.intellij.coverage.CoverageDataManagerImpl">
<SUITE FILE_PATH="coverage/zzb_data$app.coverage" NAME="app Coverage Results" MODIFIED="1727226379705" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data$main_word.coverage" NAME="main_word Coverage Results" MODIFIED="1728366719918" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data$redis_service.coverage" NAME="redis_service Coverage Results" MODIFIED="1728537921801" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data_word$test.coverage" NAME="test Coverage Results" MODIFIED="1729216810415" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data$config_init.coverage" NAME="config_init Coverage Results" MODIFIED="1728540429755" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data$test_process.coverage" NAME="test_process Coverage Results" MODIFIED="1728545660471" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data$pdf_title.coverage" NAME="pdf_title Coverage Results" MODIFIED="1727243043393" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data$db_service_word.coverage" NAME="db_service_word Coverage Results" MODIFIED="1727619004690" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data_word$zzb_logger.coverage" NAME="zzb_logger Coverage Results" MODIFIED="1729237015669" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data$db_service.coverage" NAME="db_service Coverage Results" MODIFIED="1727572268056" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data$parse_word.coverage" NAME="parse_word Coverage Results" MODIFIED="1728569829164" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/parse_word" />
<SUITE FILE_PATH="coverage/zzb_data$app_word.coverage" NAME="app_word Coverage Results" MODIFIED="1728569456711" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data_word$app_word.coverage" NAME="app_word Coverage Results" MODIFIED="1729238946419" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data_word$parse_word.coverage" NAME="parse_word Coverage Results" MODIFIED="1729064030098" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/parse_word" />
</component>
</project>


@@ -0,0 +1,298 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="AutoImportSettings">
<option name="autoReloadType" value="SELECTIVE" />
</component>
<component name="ChangeListManager">
<list default="true" id="22ddc8e3-82b9-4724-8dc4-c1cf50311f22" name="Changes" comment="">
<change afterPath="$PROJECT_DIR$/000593.docx" afterDir="false" />
<change afterPath="$PROJECT_DIR$/000593.pdf" afterDir="false" />
<change afterPath="$PROJECT_DIR$/101.docx" afterDir="false" />
<change afterPath="$PROJECT_DIR$/app_word.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/db_service_word.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/file/docx/西部建设.docx" afterDir="false" />
<change afterPath="$PROJECT_DIR$/file/docx/通威股份有限公司2023年第三季度报告.docx" afterDir="false" />
<change afterPath="$PROJECT_DIR$/main_word.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/test.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/word_title.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/zzb_logger.py" afterDir="false" />
<change afterPath="$PROJECT_DIR$/西部建设.docx" afterDir="false" />
<change afterPath="$PROJECT_DIR$/通威股份有限公司2023年第三季度报告.docx" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.vscode/launch.json" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/app.py" beforeDir="false" afterPath="$PROJECT_DIR$/app.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/config.py" beforeDir="false" afterPath="$PROJECT_DIR$/config.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/config_init.py" beforeDir="false" afterPath="$PROJECT_DIR$/config_init.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/db_service.py" beforeDir="false" afterPath="$PROJECT_DIR$/db_service.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/file/docx/000593-2023-nb-nb.docx" beforeDir="false" afterPath="$PROJECT_DIR$/file/docx/000593-2023-nb-nb.docx" afterDir="false" />
<change beforePath="$PROJECT_DIR$/file/docx/test.txt" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/main.py" beforeDir="false" afterPath="$PROJECT_DIR$/main.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/parse_word/parse_word.py" beforeDir="false" afterPath="$PROJECT_DIR$/parse_word/parse_word.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/pdf_title.py" beforeDir="false" afterPath="$PROJECT_DIR$/pdf_title.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/redis_service.py" beforeDir="false" afterPath="$PROJECT_DIR$/redis_service.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/requirements.txt" beforeDir="false" afterPath="$PROJECT_DIR$/requirements.txt" afterDir="false" />
<change beforePath="$PROJECT_DIR$/test_process.py" beforeDir="false" afterPath="$PROJECT_DIR$/test_process.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/utils.py" afterDir="false" />
</list>
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES">
<list>
<option value="Python Script" />
</list>
</option>
</component>
<component name="FlaskConsoleOptions" custom-start-script="import sys&#10;sys.path.extend([WORKING_DIR_AND_PYTHON_PATHS])&#10;from flask.cli import ScriptInfo&#10;locals().update(ScriptInfo(create_app=None).load_app().make_shell_context())&#10;print(&quot;Python %s on %s\nApp: %s [%s]\nInstance: %s&quot; % (sys.version, sys.platform, app.import_name, app.env, app.instance_path))">
<envs>
<env key="FLASK_APP" value="app" />
</envs>
<option name="myCustomStartScript" value="import sys&#10;sys.path.extend([WORKING_DIR_AND_PYTHON_PATHS])&#10;from flask.cli import ScriptInfo&#10;locals().update(ScriptInfo(create_app=None).load_app().make_shell_context())&#10;print(&quot;Python %s on %s\nApp: %s [%s]\nInstance: %s&quot; % (sys.version, sys.platform, app.import_name, app.env, app.instance_path))" />
<option name="myEnvs">
<map>
<entry key="FLASK_APP" value="app" />
</map>
</option>
</component>
<component name="Git.Settings">
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
</component>
<component name="ProjectColorInfo">{
&quot;associatedIndex&quot;: 7
}</component>
<component name="ProjectId" id="2mTMc5iMC8X5mnsBHls6IKXwyDz" />
<component name="ProjectViewState">
<option name="hideEmptyMiddlePackages" value="true" />
<option name="showLibraryContents" value="true" />
</component>
<component name="PropertiesComponent">{
&quot;keyToString&quot;: {
&quot;Python.app.executor&quot;: &quot;Run&quot;,
&quot;Python.app_word.executor&quot;: &quot;Run&quot;,
&quot;Python.config_init.executor&quot;: &quot;Run&quot;,
&quot;Python.db_service.executor&quot;: &quot;Debug&quot;,
&quot;Python.db_service_word.executor&quot;: &quot;Debug&quot;,
&quot;Python.main_word.executor&quot;: &quot;Debug&quot;,
&quot;Python.parse_word.executor&quot;: &quot;Run&quot;,
&quot;Python.pdf_title.executor&quot;: &quot;Run&quot;,
&quot;Python.redis_service.executor&quot;: &quot;Run&quot;,
&quot;Python.test.executor&quot;: &quot;Run&quot;,
&quot;Python.test_process.executor&quot;: &quot;Run&quot;,
&quot;RunOnceActivity.OpenProjectViewOnStart&quot;: &quot;true&quot;,
&quot;RunOnceActivity.ShowReadmeOnStart&quot;: &quot;true&quot;,
&quot;git-widget-placeholder&quot;: &quot;pdf-0914(测试环境)&quot;,
&quot;ignore.virus.scanning.warn.message&quot;: &quot;true&quot;,
&quot;last_opened_file_path&quot;: &quot;C:/Users/45272/OneDrive/Documents/work/code/mars_2.0.1 - income&quot;,
&quot;node.js.detected.package.eslint&quot;: &quot;true&quot;,
&quot;node.js.detected.package.tslint&quot;: &quot;true&quot;,
&quot;node.js.selected.package.eslint&quot;: &quot;(autodetect)&quot;,
&quot;node.js.selected.package.tslint&quot;: &quot;(autodetect)&quot;,
&quot;nodejs_package_manager_path&quot;: &quot;npm&quot;,
&quot;settings.editor.selected.configurable&quot;: &quot;com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable&quot;,
&quot;vue.rearranger.settings.migration&quot;: &quot;true&quot;
}
}</component>
<component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS">
<recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data_word" />
<recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data_word\file\docx" />
<recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data_word\parse_word" />
<recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data\parse_word" />
<recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data\file\docx" />
</key>
<key name="MoveFile.RECENT_KEYS">
<recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data_word" />
</key>
</component>
<component name="RunManager" selected="Python.app_word">
<configuration name="app_word" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="zzb_data" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/app_word.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="main_word" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="zzb_data" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/main_word.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="parse_word" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="zzb_data" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/parse_word" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/parse_word/parse_word.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="test" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="zzb_data" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/test.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="test_process" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="zzb_data" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/test_process.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<recent_temporary>
<list>
<item itemvalue="Python.app_word" />
<item itemvalue="Python.test" />
<item itemvalue="Python.parse_word" />
<item itemvalue="Python.main_word" />
<item itemvalue="Python.test_process" />
</list>
</recent_temporary>
</component>
<component name="SharedIndexes">
<attachedChunks>
<set>
<option value="bundled-python-sdk-5a2391486177-2887949eec09-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-233.13763.11" />
</set>
</attachedChunks>
</component>
<component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="22ddc8e3-82b9-4724-8dc4-c1cf50311f22" name="Changes" comment="" />
<created>1727096188853</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1727096188853</updated>
<workItem from="1727096189919" duration="4242000" />
<workItem from="1727226275743" duration="13996000" />
<workItem from="1727406650674" duration="9011000" />
<workItem from="1727574308954" duration="28121000" />
<workItem from="1728458648396" duration="3371000" />
<workItem from="1728462140132" duration="473000" />
<workItem from="1728462643998" duration="20841000" />
<workItem from="1728544515382" duration="4091000" />
<workItem from="1728557155319" duration="4071000" />
<workItem from="1728607689751" duration="6705000" />
<workItem from="1728868463278" duration="598000" />
<workItem from="1728953453192" duration="2839000" />
<workItem from="1728958252539" duration="12021000" />
<workItem from="1729042469650" duration="17683000" />
<workItem from="1729213219892" duration="9267000" />
<workItem from="1729484773560" duration="4727000" />
</task>
<servers />
</component>
<component name="TypeScriptGeneratedFilesManager">
<option name="version" value="3" />
</component>
<component name="XDebuggerManager">
<breakpoint-manager>
<breakpoints>
<line-breakpoint enabled="true" suspend="THREAD" type="python-line">
<url>file://$PROJECT_DIR$/main_word.py</url>
<line>87</line>
<option name="timeStamp" value="8" />
</line-breakpoint>
</breakpoints>
</breakpoint-manager>
</component>
<component name="com.intellij.coverage.CoverageDataManagerImpl">
<SUITE FILE_PATH="coverage/zzb_data$app.coverage" NAME="app Coverage Results" MODIFIED="1727226379705" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data$main_word.coverage" NAME="main_word Coverage Results" MODIFIED="1728366719918" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data$redis_service.coverage" NAME="redis_service Coverage Results" MODIFIED="1728537921801" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data_word$test.coverage" NAME="test Coverage Results" MODIFIED="1729495516957" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data$config_init.coverage" NAME="config_init Coverage Results" MODIFIED="1728540429755" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data$test_process.coverage" NAME="test_process Coverage Results" MODIFIED="1728545660471" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data$pdf_title.coverage" NAME="pdf_title Coverage Results" MODIFIED="1727243043393" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data$db_service_word.coverage" NAME="db_service_word Coverage Results" MODIFIED="1727619004690" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data$db_service.coverage" NAME="db_service Coverage Results" MODIFIED="1727572268056" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data$parse_word.coverage" NAME="parse_word Coverage Results" MODIFIED="1728569829164" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/parse_word" />
<SUITE FILE_PATH="coverage/zzb_data$app_word.coverage" NAME="app_word Coverage Results" MODIFIED="1728569456711" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data_word$app_word.coverage" NAME="app_word Coverage Results" MODIFIED="1729587252480" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data_word$parse_word.coverage" NAME="parse_word Coverage Results" MODIFIED="1729257457108" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/parse_word" />
</component>
</project>


@@ -0,0 +1,186 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="AutoImportSettings">
<option name="autoReloadType" value="SELECTIVE" />
</component>
<component name="ChangeListManager">
<list default="true" id="22ddc8e3-82b9-4724-8dc4-c1cf50311f22" name="Changes" comment="">
<change afterPath="$PROJECT_DIR$/app_word.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/.vscode/launch.json" beforeDir="false" />
<change beforePath="$PROJECT_DIR$/app.py" beforeDir="false" afterPath="$PROJECT_DIR$/app.py" afterDir="false" />
<change beforePath="$PROJECT_DIR$/file/docx/000593-2023-nb-nb.docx" beforeDir="false" afterPath="$PROJECT_DIR$/file/docx/000593-2023-nb-nb.docx" afterDir="false" />
<change beforePath="$PROJECT_DIR$/parse_word/parse_word.py" beforeDir="false" afterPath="$PROJECT_DIR$/parse_word/parse_word.py" afterDir="false" />
</list>
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="Git.Settings">
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
</component>
<component name="ProjectColorInfo">{
&quot;associatedIndex&quot;: 7
}</component>
<component name="ProjectId" id="2mTMc5iMC8X5mnsBHls6IKXwyDz" />
<component name="ProjectViewState">
<option name="hideEmptyMiddlePackages" value="true" />
<option name="showLibraryContents" value="true" />
</component>
<component name="PropertiesComponent">{
&quot;keyToString&quot;: {
&quot;Python.app.executor&quot;: &quot;Run&quot;,
&quot;Python.app_word.executor&quot;: &quot;Run&quot;,
&quot;Python.parse_word.executor&quot;: &quot;Run&quot;,
&quot;Python.pdf_title.executor&quot;: &quot;Run&quot;,
&quot;RunOnceActivity.OpenProjectViewOnStart&quot;: &quot;true&quot;,
&quot;RunOnceActivity.ShowReadmeOnStart&quot;: &quot;true&quot;,
&quot;git-widget-placeholder&quot;: &quot;pdf-0914(测试环境)&quot;,
&quot;last_opened_file_path&quot;: &quot;C:/Users/45272/OneDrive/Documents/work/code/zzb_data&quot;,
&quot;node.js.detected.package.eslint&quot;: &quot;true&quot;,
&quot;node.js.detected.package.tslint&quot;: &quot;true&quot;,
&quot;node.js.selected.package.eslint&quot;: &quot;(autodetect)&quot;,
&quot;node.js.selected.package.tslint&quot;: &quot;(autodetect)&quot;,
&quot;nodejs_package_manager_path&quot;: &quot;npm&quot;,
&quot;vue.rearranger.settings.migration&quot;: &quot;true&quot;
}
}</component>
<component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS">
<recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data" />
</key>
</component>
<component name="RunManager" selected="Python.app_word">
<configuration name="app" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="zzb_data" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/app.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="app_word" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="zzb_data" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/app_word.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="parse_word" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="zzb_data" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/parse_word" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/parse_word/parse_word.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="pdf_title" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="zzb_data" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/pdf_title.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<recent_temporary>
<list>
<item itemvalue="Python.app_word" />
<item itemvalue="Python.parse_word" />
<item itemvalue="Python.pdf_title" />
<item itemvalue="Python.app" />
</list>
</recent_temporary>
</component>
<component name="SharedIndexes">
<attachedChunks>
<set>
<option value="bundled-js-predefined-1d06a55b98c1-91d5c284f522-JavaScript-PY-241.15989.155" />
<option value="bundled-python-sdk-babbdf50b680-7c6932dee5e4-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-241.15989.155" />
</set>
</attachedChunks>
</component>
<component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="22ddc8e3-82b9-4724-8dc4-c1cf50311f22" name="Changes" comment="" />
<created>1727096188853</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1727096188853</updated>
<workItem from="1727096189919" duration="4242000" />
<workItem from="1727226275743" duration="13996000" />
<workItem from="1727406650674" duration="9304000" />
</task>
<servers />
</component>
<component name="TypeScriptGeneratedFilesManager">
<option name="version" value="3" />
</component>
<component name="com.intellij.coverage.CoverageDataManagerImpl">
<SUITE FILE_PATH="coverage/zzb_data$parse_word.coverage" NAME="parse_word Coverage Results" MODIFIED="1727421672403" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/parse_word" />
<SUITE FILE_PATH="coverage/zzb_data$app.coverage" NAME="app Coverage Results" MODIFIED="1727226379705" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data$pdf_title.coverage" NAME="pdf_title Coverage Results" MODIFIED="1727243043393" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
<SUITE FILE_PATH="coverage/zzb_data$app_word.coverage" NAME="app_word Coverage Results" MODIFIED="1727422680153" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
</component>
</project>

BIN
zzb_data_word/1.docx Normal file

Binary file not shown.

BIN
zzb_data_word/101.docx Normal file

Binary file not shown.

BIN
zzb_data_word/102.docx Normal file

Binary file not shown.

BIN
zzb_data_word/103.docx Normal file

Binary file not shown.

207
zzb_data_word/DB_Trans.py Normal file
View File

@ -0,0 +1,207 @@
import pymssql
import mysql.connector
import logging
from multiprocessing import Pool
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# SQL Server configuration
sql_server_config = {
"server": "203.192.15.17", # SQL Server IP address
"port": 28063, # SQL Server port
"user": "zncbuser", # username
"password": "ZZB-Cbindex-data", # password
"database": "jydb", # database name
}
# MySQL configuration
mysql_config = {
"host": "rm-bp1f85h3xs6mvnf5e3o.mysql.rds.aliyuncs.com", # MySQL host address
"user": "zzb_jydb", # username
"password": "Ysdbsdjs89Yrqwp", # password
"database": "zzb_jydb", # database name
}
# Batch size (rows fetched and inserted per round trip)
BATCH_SIZE = 100000
# Maximum number of worker processes
MAX_PROCESSES = 1
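# NOTE: table and column names below are interpolated into SQL strings; this is
# acceptable only because they come from the source database's own
# INFORMATION_SCHEMA, never from user input.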
def sync_table(table_name):
try:
# Connect to SQL Server
sql_server_conn = pymssql.connect(
server=sql_server_config["server"],
port=sql_server_config["port"],
user=sql_server_config["user"],
password=sql_server_config["password"],
database=sql_server_config["database"],
)
sql_server_cursor = sql_server_conn.cursor()
# Connect to MySQL
mysql_conn = mysql.connector.connect(**mysql_config)
mysql_cursor = mysql_conn.cursor()
logging.info(f"Processing table: {table_name}")
# Check whether the table already exists in MySQL
mysql_cursor.execute(f"SHOW TABLES LIKE '{table_name}'")
table_exists = mysql_cursor.fetchone()
if not table_exists:
# Table is missing: create it from SQL Server metadata
sql_server_cursor.execute(f"""
SELECT
COLUMN_NAME,
DATA_TYPE,
CHARACTER_MAXIMUM_LENGTH,
NUMERIC_PRECISION,
NUMERIC_SCALE
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = '{table_name}'
ORDER BY ORDINAL_POSITION
""")
columns = sql_server_cursor.fetchall()
# Build the MySQL CREATE TABLE statement
create_table_sql = f"CREATE TABLE {table_name} ("
for col in columns:
col_name = col[0]
col_type = col[1]
# Length/precision metadata for the column
char_length = col[2]
numeric_precision = col[3]
numeric_scale = col[4]
# Naive type mapping (may need adjustment for other source types)
if col_type == "varchar":
col_type = "VARCHAR(255)"
elif col_type == "int":
col_type = "INT"
elif col_type == "datetime":
col_type = "DATETIME"
elif col_type == "decimal":
if numeric_precision and numeric_scale:
col_type = f"DECIMAL({numeric_precision}, {numeric_scale})"
else:
col_type = "DECIMAL(10, 2)" # 默认值
elif col_type == "money":
col_type = "DECIMAL(19, 4)"
elif col_type == "smallmoney":
col_type = "DECIMAL(19, 4)"
elif col_type == "image":
col_type = "LONGBLOB"
# Set the column's NULL attribute
if col_name.lower() == "id":
# The ID column must be NOT NULL
create_table_sql += f"`{col_name}` {col_type} NOT NULL, "
else:
# All other columns allow NULL
create_table_sql += f"`{col_name}` {col_type} , "
# Add the primary key constraint (assumes the first column is the key)
create_table_sql = create_table_sql.rstrip(", ") + f", PRIMARY KEY ({columns[0][0]}))"
logging.info(f"Create table SQL: {create_table_sql}")
# Create the table in MySQL
mysql_cursor.execute(create_table_sql)
logging.info(f"Table {table_name} created in MySQL.")
else:
logging.info(f"Table {table_name} already exists in MySQL. Updating data...")
# Fetch the column list in table-definition order so the INSERT column order matches SELECT *
sql_server_cursor.execute(f"SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '{table_name}' ORDER BY ORDINAL_POSITION")
columns = sql_server_cursor.fetchall()
# Get the current max ID in the target MySQL table
mysql_cursor.execute(f"SELECT MAX({columns[0][0]}) FROM {table_name}")
max_id = mysql_cursor.fetchone()[0]
if max_id is None:
max_id = 0 # empty target table: start from 0
logging.info(f"Target table {table_name} has max ID: {max_id}")
# Pull the SQL Server data in batches
offset = 0
while True:
# Page through the source rows with ROW_NUMBER()
sql_server_cursor.execute(f"""
SELECT * FROM (
SELECT *, ROW_NUMBER() OVER (ORDER BY {columns[0][0]}) AS RowNum
FROM {table_name}
WHERE {columns[0][0]} > {max_id}
) AS SubQuery
WHERE RowNum BETWEEN {offset + 1} AND {offset + BATCH_SIZE}
""")
rows = sql_server_cursor.fetchall()
if not rows:
logging.info(f"表:{table_name} 数据已经是最新的,不需要更新")
break # 如果没有数据了,退出循环
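# Strip the trailing RowNum helper column added by the paging query before inserting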
insert_values = [row[:-1] for row in rows]
# Bulk-insert the batch into MySQL
if insert_values:
# Dynamically build the column list and placeholders
placeholders = ", ".join(["%s"] * len(insert_values[0]))
columns_list = ", ".join([col[0] for col in columns])
insert_sql = f"INSERT INTO {table_name} ({columns_list}) VALUES ({placeholders})"
# Execute the batched insert
try:
mysql_cursor.executemany(insert_sql, insert_values)
mysql_conn.commit()
logging.info(f"Inserted {len(insert_values)} rows into {table_name}.")
except mysql.connector.errors.DataError as e:
logging.error(f"DataError: {e}")
mysql_conn.rollback()
offset += BATCH_SIZE
logging.info(f"Processed {offset} rows in {table_name}...")
# Close connections
sql_server_cursor.close()
sql_server_conn.close()
mysql_cursor.close()
mysql_conn.close()
logging.info(f"Sync completed for table: {table_name}")
except Exception as e:
logging.error(f"Failed to sync table {table_name}. Error: {e}")
def main():
try:
# Connect to SQL Server
sql_server_conn = pymssql.connect(
server=sql_server_config["server"],
port=sql_server_config["port"],
user=sql_server_config["user"],
password=sql_server_config["password"],
database=sql_server_config["database"],
)
sql_server_cursor = sql_server_conn.cursor()
# List all base tables in SQL Server
sql_server_cursor.execute("SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE' ORDER BY TABLE_NAME")
tables = sql_server_cursor.fetchall()
# Sync the tables concurrently with a process pool
with Pool(processes=MAX_PROCESSES) as pool:
pool.map(sync_table, [table[0] for table in tables])
logging.info("All tables synced successfully!")
except Exception as e:
logging.error(f"Main function failed. Error: {e}")
finally:
# Close connections
if 'sql_server_cursor' in locals():
sql_server_cursor.close()
if 'sql_server_conn' in locals():
sql_server_conn.close()
# Entry point
if __name__ == "__main__":
main()
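# To re-sync a single table ad hoc (for debugging), sync_table can be called
# directly, e.g. sync_table("SecuMain") -- the table name here is only an example.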

674
zzb_data_word/LICENSE Normal file
View File

@ -0,0 +1,674 @@
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU General Public License is a free, copyleft license for
software and other kinds of works.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.
Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.
For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Use with the GNU Affero General Public License.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
<program> Copyright (C) <year> <name of author>
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<http://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
<http://www.gnu.org/philosophy/why-not-lgpl.html>.

35
zzb_data_word/Mil_test.py Normal file
View File

@ -0,0 +1,35 @@
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection ,utility
# Test environment is up at 114.55.115.191:19530
# Connect to Milvus
#connections.connect("default", host="114.55.115.191", port="19530") # local test
#connections.connect("default", host="124.71.157.162", port="19530") # production
# connections.connect("default", host="124.70.129.232", port="19530") # test environment
#connections.connect("default", host="1.94.113.19", port="19530") # test environment 13
connections.connect("default", host="1.94.60.103", port="19530") # test environment 103
#connections.connect("default", host="192.168.0.129", port="19530")
collections = utility.list_collections()
# production: 1.94.179.121
# test: 114.55.115.191
# Print the collection list
print("Collections in Milvus:")
for collection in collections:
print(collection)
collection = Collection(name='pdf_measure_v4')
# 获取集合的详细信息
#print(f"Collection name: {collection.name}")
print(f"Collection schema: {collection.schema}")
print(f"Number of entities in collection: {collection.num_entities}")
collection.load()
entities = collection.query(expr="file_id == '39369'", output_fields=["table_num","table_index","measure_name","measure_value","measure_unit","file_id"],limit = 10)
count = 0
# Print the rows in the collection
print("Data in collection:")
for entity in entities:
print(entity)
count += 1
print(f'Number of rows matching this filter: {count}')
connections.disconnect('default')

69
zzb_data_word/Mil_unit.py Normal file
View File

@ -0,0 +1,69 @@
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection,MilvusClient
from config import MILVUS_CLIENT
import time
from datetime import datetime, timedelta
def create_partition_by_hour(current_hour):
# Connect to the Milvus server
connections.connect("default", uri=MILVUS_CLIENT)
# Get the collection
collection_name = "pdf_measure_v4"
collection = Collection(collection_name)
# Create the partition for the current hour
partition_name = f"partition_{current_hour}"
if not collection.has_partition(partition_name):
collection.create_partition(partition_name)
print(f"Created partition: {partition_name}")
partition = collection.partition(partition_name)
partition.load()
# List all partitions
partitions = collection.partitions
# Drop every partition except the default one and the current hour's
for partition in partitions:
name = partition.name
if name not in ["_default", partition_name]: # keep the default and current partitions
pre_partition = collection.partition(name)
pre_partition.release()
collection.drop_partition(name)
print(f"Partition '{name}' deleted.")
from pymilvus import connections, CollectionSchema, Collection,utility,FieldSchema,DataType
# Connect to Milvus on server B
# connections.connect(host='124.70.129.232', port='19530') # test server
connections.connect(host='127.0.0.1', port='19530') # test server
# Drop the existing collection so it can be recreated from scratch
utility.drop_collection("pdf_measure_v4")
# Define the schema fields
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=1536),
FieldSchema(name="table_num", dtype=DataType.INT16),
FieldSchema(name="table_index", dtype=DataType.INT16),
FieldSchema(name="measure_name", dtype=DataType.VARCHAR, max_length=200),
FieldSchema(name="measure_value", dtype=DataType.VARCHAR, max_length=200),
FieldSchema(name="file_id", dtype=DataType.VARCHAR, max_length=200),
FieldSchema(name="measure_unit", dtype=DataType.VARCHAR, max_length=200)
]
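# dim=1536 must match the output dimension of whatever embedding model fills "vector"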
# Assemble the collection schema
schema = CollectionSchema(fields=fields, description="My Milvus collection")
# Create the collection
collection = Collection(name="pdf_measure_v4", schema=schema)
collection = Collection("pdf_measure_v4")
index_params = {
"index_type": "IVF_FLAT",
"metric_type": "COSINE",
"params": {"nlist": 128}
}
collection.create_index(field_name="vector", index_params=index_params)
collection.load()
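# With IVF_FLAT, nlist=128 is the number of clusters the index partitions the
# vectors into; searches then probe a subset of clusters (nprobe) to trade recall for latency.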

366
zzb_data_word/app.py Normal file
View File

@ -0,0 +1,366 @@
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import os
import utils
import queue
from multiprocessing import Process,Manager
import pdf_title
import main
import time
import config
import requests
import db_service
import threading
app = FastAPI()
cpu_count = os.cpu_count()
job_queue = queue.Queue()
# Request body model
class FileItem(BaseModel):
file_path: str
file_id: str
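# Each request enqueues one file and spawns a worker thread that drains exactly
# one queue entry, so parses run one per request thread.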
def run_job():
# Check whether a job is waiting to run
if_run = True
if job_queue.empty():
print("job_queue is empty")
if_run = False
if if_run:
job_config = job_queue.get()
page_list = []
file_path = job_config['file_path']
file_id = job_config['file_id']
job_status = True
continue_execution = True
try:
# Download the PDF if given as a URL
start_time = time.time()
print(f"Starting file parsing job: {file_path}")
if file_path.startswith('http'):
file_path = utils.save_pdf_from_url(file_path, config.FILE_PATH)
try:
file_info = pdf_title.create_text_outline(file_path,file_id)
except Exception as e:
response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 7})
print(f'Status notification URL for {file_id}: {response.url}')
print(f'Status notification response for {file_id}: {response.text}')
print(f"{file_id} failed: {e}")
continue_execution = False
if continue_execution:
print(cpu_count)
parent_table_pages = file_info['parent_table_pages']
print('parent_table_pages is:')
print(parent_table_pages)
# page_nums = [
# '1-3',
# '4-6',
# ]
page_num = file_info['page_count']
if page_num < cpu_count:
p_count = page_num
else :
p_count = cpu_count
for i in range(p_count):
# for i in range(2):
page_list.append({
'type': 'table',
'page_num': file_info['split_parts']['table_split_parts'][i],
# 'page_num': page_nums[i],
'path': file_path,
'file_id': file_id,
'parent_table_pages': parent_table_pages,
'page_count': file_info['page_count'],
'tables_range': {},
})
# Notify that parsing has started
response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 5})
print(f'PDF parse-start notification URL for {file_id}: {response.url}')
print(f'PDF parse-start notification response for {file_id}: {response.text}')
parser_start_time = time.time()
processes = []
time_dispatch_job = time.time()
for job_info in page_list:
p = Process(target=main.dispatch_job, args=(job_info,))
processes.append(p)
p.start()
#time_dispatch_job_end = time.time()
#process_time = time_dispatch_job_end - time_dispatch_job
#db_service.process_time(file_id,'1',process_time)
print('Waiting for all subtasks to finish, job ID:', file_id)
for p in processes:
p.join()
print('PDF parsing subtasks finished, job ID:', file_id)
time_dispatch_job_end = time.time()
process_time = time_dispatch_job_end - time_dispatch_job
db_service.process_time(file_id,'1',process_time,time_dispatch_job,time_dispatch_job_end)
parser_end_time = time.time()
print(f"解析任务 {file_id} 完成,耗时{(parser_end_time - parser_start_time):.2f} 秒。")
#这里做一步判断,看看是否还要继续。
if db_service.file_type_check(file_id):
print("文本较真表格生成已结束")
else:
# Notify that measure extraction is starting
response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 6})
print(f'Measure-extraction notification URL for {file_id}: {response.url}')
print(f'Measure-extraction notification response for {file_id}: {response.text}')
parser_start_time = time.time()
print('Starting table measure extraction, job ID:', file_id)
time_start = time.time()
if db_service.file_type_check_v2(file_id) == 3: # is this a Q3 (third-quarter) report?
main.start_table_measure_job(file_id)
#time_start_end = time.time()
#process_time = time_start_end - time_start
#db_service.process_time(file_id,'2',process_time)
time_start_end = time.time()
process_time = time_start_end - time_start
db_service.process_time(file_id,'2',process_time,time_start,time_start_end)
print('Table measure extraction finished, job ID:', file_id)
parser_end_time = time.time()
print(f"Table measure extraction for {file_id} finished in {(parser_end_time - parser_start_time):.2f} s.")
print('Starting measure normalization, job ID:', file_id)
time_update = time.time()
main.update_measure_data(file_id,file_path,parent_table_pages)
#time_update_end = time.time()
#process_time = time_update_end - time_update
#db_service.process_time(file_id,'3',process_time)
print('Normalization finished, job ID:', file_id)
end_time = time.time()
print(f"Job {file_id} finished in {(end_time - start_time):.2f} s.")
time_update_end = time.time()
process_time = time_update_end - time_update
db_service.process_time(file_id,'3',process_time,time_update,time_update_end)
else: # not a Q3 report: follow the annual/semi-annual report path
main.start_table_measure_job(file_id)
#time_start_end = time.time()
#process_time = time_start_end - time_start
#db_service.process_time(file_id,'2',process_time)
time_start_end = time.time()
process_time = time_start_end - time_start
db_service.process_time(file_id,'2',process_time,time_start,time_start_end)
print('Table measure extraction finished, job ID:', file_id)
parser_end_time = time.time()
print(f"Table measure extraction for {file_id} finished in {(parser_end_time - parser_start_time):.2f} s.")
print('Starting measure normalization, job ID:', file_id)
time_update = time.time()
main.update_measure_data(file_id,file_path,parent_table_pages)
#time_update_end = time.time()
#process_time = time_update_end - time_update
#db_service.process_time(file_id,'3',process_time)
print('Normalization finished, job ID:', file_id)
end_time = time.time()
print(f"Job {file_id} finished in {(end_time - start_time):.2f} s.")
time_update_end = time.time()
process_time = time_update_end - time_update
db_service.process_time(file_id,'3',process_time,time_update,time_update_end)
# Notify that the job is complete
response_time = time.time()
response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 1})
print(f'Status notification URL for {file_id}: {response.url}')
print(f'Status notification response for {file_id}: {response.text}')
response_time_end = time.time()
process_time = response_time_end - response_time
db_service.process_time(file_id,'4',process_time,response_time,response_time_end)
except Exception as e:
# Notify that the job failed
response_time = time.time()
# failure notification (status 4) regardless of the error type
response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id, 'status': 4})
response_time_end = time.time()
process_time = response_time_end - response_time
db_service.process_time(file_id,'4',process_time,response_time,response_time_end)
print(f'Status notification URL for {file_id}: {response.url}')
print(f'Status notification response for {file_id}: {response.text}')
print(f"Response status code: {response.status_code}")
print(f"{file_id} failed: {e}")
finally:
print(f"任务 {file_id} 完成,运行状态:{job_status}")
#pdf_company_0824.name_code_fix(file_id,file_path)
#print('公司名与编码填充完毕')
else:
print("有任务运行中,需要等待.....")
def parse_pdf_route(fileItem: FileItem):
# 创建一个队列,保证每次只执行一个文件解析任务
job_queue.put({
'file_path' : fileItem.file_path,
'file_id' : fileItem.file_id
})
print(f"增加 {fileItem.file_id} 到队列.")
threading.Thread(target=run_job, args=()).start()
return {"success": True, "msg": "文件解析开始"}
app.post("/parser/start",
tags=["parser"],
summary="解析Pdf文件",
)(parse_pdf_route)
def run_disclosure():
# Check whether a job is waiting to run
if_run = True
if job_queue.empty():
print("job_queue is empty")
if_run = False
if if_run:
job_config = job_queue.get()
page_list = []
file_path = job_config['file_path']
file_id = job_config['file_id']
job_status = True
continue_execution = True
try:
# Download the PDF if given as a URL
start_time = time.time()
print(f"Starting file parsing job: {file_path}")
print('Disclosure pipeline')
if file_path.startswith('http'):
file_path = utils.save_pdf_from_url(file_path, config.FILE_PATH)
try:
file_info = pdf_title.create_text_outline_disclosure(file_path,file_id)
except Exception as e:
response = requests.get(config.NOTIFY_ADDR_DIS, params={'fileId': file_id,'status': 7})
print(f'Status notification URL for {file_id}: {response.url}')
print(f'Status notification response for {file_id}: {response.text}')
print(f"{file_id} failed: {e}")
continue_execution = False
if continue_execution:
print(cpu_count)
parent_table_pages = file_info['parent_table_pages']
print('parent_table_pages is:')
print(parent_table_pages)
# page_nums = [
# '1-3',
# '4-6',
# ]
print(cpu_count)
print('test')
page_num = file_info['page_count']
if page_num < cpu_count:
p_count = page_num
else :
p_count = cpu_count
for i in range(p_count):
# for i in range(2):
page_list.append({
'type': 'table',
'page_num': file_info['split_parts']['table_split_parts'][i],
# 'page_num': page_nums[i],
'path': file_path,
'file_id': file_id,
'parent_table_pages': parent_table_pages,
'page_count': file_info['page_count'],
'tables_range': {},
})
# Notify that parsing has started
response = requests.get(config.NOTIFY_ADDR_DIS, params={'fileId': file_id,'status': 5})
print(f'PDF parse-start notification URL for {file_id}: {response.url}')
print(f'PDF parse-start notification response for {file_id}: {response.text}')
parser_start_time = time.time()
processes = []
time_dispatch_job = time.time()
for job_info in page_list:
p = Process(target=main.dispatch_disclosure, args=(job_info,))
processes.append(p)
p.start()
#time_dispatch_job_end = time.time()
#process_time = time_dispatch_job_end - time_dispatch_job
#db_service.process_time(file_id,'1',process_time)
print('Waiting for all subtasks to finish, job ID:', file_id)
for p in processes:
p.join()
print('PDF parsing subtasks finished, job ID:', file_id)
time_dispatch_job_end = time.time()
process_time = time_dispatch_job_end - time_dispatch_job
#db_service.process_time(file_id,'1',process_time,time_dispatch_job,time_dispatch_job_end)
parser_end_time = time.time()
print(f"解析任务 {file_id} 完成,耗时{(parser_end_time - parser_start_time):.2f} 秒。")
#这里做一步判断,看看是否还要继续。
#if db_service.file_type_check(file_id):
print("文本较真表格生成已结束")
# Notify that the job is complete
response_time = time.time()
response = requests.get(config.NOTIFY_ADDR_DIS, params={'fileId': file_id,'status': 1})
print(f'Status notification URL for {file_id}: {response.url}')
print(f'Status notification response for {file_id}: {response.text}')
response_time_end = time.time()
process_time = response_time_end - response_time
#db_service.process_time(file_id,'4',process_time,response_time,response_time_end)
except Exception as e:
# Notify that the job failed
response_time = time.time()
# failure notification (status 4) regardless of the error type
response = requests.get(config.NOTIFY_ADDR_DIS, params={'fileId': file_id, 'status': 4})
response_time_end = time.time()
process_time = response_time_end - response_time
#db_service.process_time(file_id,'4',process_time,response_time,response_time_end)
print(f'Status notification URL for {file_id}: {response.url}')
print(f'Status notification response for {file_id}: {response.text}')
print(f"Response status code: {response.status_code}")
print(f"{file_id} failed: {e}")
finally:
print(f"任务 {file_id} 完成,运行状态:{job_status}")
#pdf_company_0824.name_code_fix(file_id,file_path)
#print('公司名与编码填充完毕')
else:
print("有任务运行中,需要等待.....")
#信披文件解析
def disclosure(fileItem: FileItem):
# 创建一个队列,保证每次只执行一个文件解析任务
job_queue.put({
'file_path' : fileItem.file_path,
'file_id' : fileItem.file_id
})
print(f"增加 {fileItem.file_id} 到队列.")
threading.Thread(target=run_disclosure, args=()).start()
return {"success": True, "msg": "文件解析开始"}
app.post("/parser/disclosure",
tags=["parser"],
summary="信披文件解析",
)(disclosure)
# Run the FastAPI app
if __name__ == "__main__":
# Start the server
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=config.PORT)
# Local debugging job
#job_queue.put({
#'file_path' : '6281.pdf',
#'file_id' : '6281'
#})
#run_job()
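# Example request (the port comes from config.PORT; the URL and id below are
# illustrative only):
# curl -X POST http://localhost:8000/parser/start \
#   -H 'Content-Type: application/json' \
#   -d '{"file_path": "http://example.com/report.pdf", "file_id": "1234"}'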

226
zzb_data_word/app_word.py Normal file
View File

@ -0,0 +1,226 @@
from fastapi import FastAPI
from pydantic import BaseModel
import os
import utils
import queue
import multiprocessing
from multiprocessing import Process
import word_title
import time
import config
import requests
import threading
from parse_word import parse_docx, split_text_table
import json
import db_service_word
import main_word
from zzb_logger import applog
app = FastAPI()
cpu_count = os.cpu_count()
job_queue = queue.Queue()
# Request body model
class FileItem(BaseModel):
file_path: str
file_id: str
def split_list(lst, n):
k, m = divmod(len(lst), n)
return [lst[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]
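# A small sketch (hypothetical input) of how split_list balances work across
# processes: when len(lst) % n != 0, the first chunks get one extra item each.
#   split_list([1, 2, 3, 4, 5, 6, 7], 3)  ->  [[1, 2, 3], [4, 5], [6, 7]]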
def run_job():
    # Check whether a job is already running
if_run = True
if job_queue.empty():
applog.info(f"job_queue为空:")
if_run = False
if if_run:
job_config = job_queue.get()
file_path = job_config['file_path']
file_id = job_config['file_id']
continue_execution = True
try:
start_time = time.time()
applog.info(f"开始启动文件解析任务: {file_path}")
if file_path.startswith('http'):
file_path = utils.save_pdf_from_url(file_path, config.FILE_PATH)
try:
time_dispatch_job = time.time()
            # Notify that parsing is starting (the call is still made despite the earlier "skip for now" note)
response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 5})
applog.info(f'通知pdf开始解析url:{file_id}:{response.url}')
applog.info(f'通知pdf开始解析状态:{file_id}:{response.text}')
            parsed_content, catalog_content = parse_docx(file_path)  # catalog_content (the TOC) must be written to the database
json_parsed_content = json.loads(parsed_content)
json_catalog_content = json.loads(catalog_content)
db_service_word.word_title_insert_mysql(file_id, json_catalog_content)
parent_table_pages = word_title.get_parent_table_pages(json_catalog_content,file_id)
text_elements_json, table_elements_json = split_text_table(json_parsed_content)
#
processes = []
text_list = split_list(json.loads(text_elements_json), cpu_count)
applog.info(f'text任务ID:{file_id}')
for job_info in text_list:
p = Process(target=main_word.process_text_content, args=(file_id, job_info,json.loads(table_elements_json),json.loads(text_elements_json)))
processes.append(p)
p.start()
applog.info(f'等待所有子任务完成任务ID:{file_id}')
for p in processes:
p.join()
applog.info(f'word表格中 text解析完成任务ID:{file_id}',)
processes = []
table_list = split_list(json.loads(table_elements_json), cpu_count)
applog.info(f'开始解析word表表格中的table任务ID:{file_id}')
for job_info in table_list:
p = Process(target=main_word.process_table, args=(file_id, job_info,))
processes.append(p)
p.start()
applog.info(f'等待所有子任务完成任务ID:{file_id}')
for p in processes:
p.join()
# main_word.process_table(file_id, json.loads(table_elements_json))
applog.info(f'word表格中 table解析完成任务ID:{file_id}')
time_dispatch_job_end = time.time()
process_time = time_dispatch_job_end - time_dispatch_job
db_service_word.process_time(file_id, '1', process_time, time_dispatch_job, time_dispatch_job_end)
parser_end_time = time.time()
applog.info(f"解析任务 {file_id} 完成,耗时{(parser_end_time - time_dispatch_job):.2f} 秒。")
except Exception as e:
response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 7})
applog.info(f'通知任务状态url:{file_id}:{response.url}')
applog.info(f'通知任务状态任务:{file_id}:{response.text}')
applog.info(f"{file_id}运行失败: {e}")
continue_execution = False
if continue_execution :
        # Decide at this point whether processing should continue.
if db_service_word.file_type_check(file_id):
applog.info("文本较真表格生成已结束")
else:
            # Notify that measure extraction is starting ---------------------------------
response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 6})
applog.info(f'通知开始抽取指标url:{file_id}:{response.url}')
applog.info(f'通知开始抽取指标状态:{file_id}:{response.text}')
parser_start_time = time.time()
applog.info(f'开始表格指标抽取任务ID:{file_id}')
time_start = time.time()
            if db_service_word.file_type_check_v2(file_id) == 3:  # is this a Q3 report?
main_word.start_table_measure_job(file_id)
#time_start_end = time.time()
#process_time = time_start_end - time_start
#db_service.process_time(file_id,'2',process_time)
time_start_end = time.time()
process_time = time_start_end - time_start
db_service_word.process_time(file_id,'2',process_time,time_start,time_start_end)
applog.info(f'表格指标抽取完成任务ID:{file_id}')
parser_end_time = time.time()
applog.info(f"表格指标抽取 {file_id} 完成,耗时{(parser_end_time - parser_start_time):.2f} 秒。")
applog.info(f'启动这个指标归一化任务ID-修改测试:{file_id}')
time_update = time.time()
main_word.update_measure_data(file_id,file_path,parent_table_pages)
#time_update_end = time.time()
#process_time = time_update_end - time_update
#db_service.process_time(file_id,'3',process_time)
applog.info(f'归一化完成任务ID:{file_id}')
end_time = time.time()
applog.info(f"任务 {file_id} 完成,耗时{(end_time - start_time):.2f} 秒。")
time_update_end = time.time()
process_time = time_update_end - time_update
db_service_word.process_time(file_id,'3',process_time,time_update,time_update_end)
            else:  # not a Q3 report: follow the annual / half-year report path
main_word.start_table_measure_job(file_id)
#time_start_end = time.time()
#process_time = time_start_end - time_start
#db_service.process_time(file_id,'2',process_time)
time_start_end = time.time()
process_time = time_start_end - time_start
db_service_word.process_time(file_id,'2',process_time,time_start,time_start_end)
applog.info(f'表格指标抽取完成任务ID:{file_id}' )
parser_end_time = time.time()
applog.info(f"表格指标抽取 {file_id} 完成,耗时{(parser_end_time - parser_start_time):.2f} 秒。")
applog.info(f'启动这个指标归一化任务ID-修改测试:{file_id}' )
time_update = time.time()
main_word.update_measure_data(file_id,file_path,parent_table_pages)
#time_update_end = time.time()
#process_time = time_update_end - time_update
#db_service.process_time(file_id,'3',process_time)
applog.info(f'归一化完成任务ID:{file_id}')
end_time = time.time()
applog.info(f"任务 {file_id} 完成,耗时{(end_time - start_time):.2f} 秒。")
time_update_end = time.time()
process_time = time_update_end - time_update
db_service_word.process_time(file_id,'3',process_time,time_update,time_update_end)
            # Notify that the task has completed
response_time = time.time()
response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 1})
applog.info(f'通知任务状态url:{file_id}:{response.url}')
applog.info(f'通知任务状态任务:{file_id}:{response.text}')
response_time_end = time.time()
process_time = response_time_end - response_time
db_service_word.process_time(file_id,'4',process_time,response_time,response_time_end)
except Exception as e:
        # Notify the task status (failure)
response_time = time.time()
response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 4})
response_time_end = time.time()
process_time = response_time_end - response_time
db_service_word.process_time(file_id,'4',process_time,response_time,response_time_end)
applog.info(f'通知任务状态url:{file_id}:{response.url}')
applog.info(f'通知任务状态任务:{file_id}:{response.text}')
applog.info(f"Response status code: {response.status_code}")
applog.info(f"{file_id}运行失败: {e}")
finally:
applog.info(f"任务 {file_id} 完成")
else:
applog.info("有任务运行中,需要等待.....")
def parse_route(fileItem: FileItem):
    # Enqueue the job so that only one file is parsed at a time
job_queue.put({
'file_path' : fileItem.file_path,
'file_id' : fileItem.file_id,
# 'type': fileItem.type
})
applog.info(f"增加 {fileItem.file_id} 到队列.")
threading.Thread(target=run_job, args=()).start()
return {"success": True, "msg": "文件解析开始"}
app.post("/parser/start",
tags=["parser"],
summary="解析Pdf文件",
)(parse_route)
# Run the FastAPI application
if __name__ == "__main__":
    # Server startup (disabled below while debugging locally)
# import uvicorn
#
# uvicorn.run(app, host="0.0.0.0", port=config.PORT)
    # Local debugging job
file_id = "201917"
job_queue.put({
'file_path': '1.docx',
'file_id': file_id,
})
db_service_word.delete_database(file_id)
run_job()

View File

@ -0,0 +1,251 @@
import camelot
import time
import re
import numpy as np
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
import pdfplumber
import json
import ast
import utils
def chunks(l, n):
"""Yield successive n-sized chunks from l."""
for i in range(0, len(l), n):
yield l[i : i + n]
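# Quick sanity check of the chunking helper:
#   list(chunks([1, 2, 3, 4, 5], 2))  ->  [[1, 2], [3, 4], [5]]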
def extract_tables(filepath, pages_num, chunk_num=50, export_path=".", params={}):
    """
    Divide the extraction work into n chunks. At the end of every chunk,
    save data on disk and free RAM.

    filepath : str
        Filepath or URL of the PDF file.
    pages_num : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    """
# get list of pages from camelot.handlers.PDFHandler
handler = camelot.handlers.PDFHandler(filepath)
page_list = handler._get_pages(pages=pages_num)
# chunk pages list
page_chunks = list(chunks(page_list, chunk_num))
# extraction and export
for chunk in page_chunks:
pages_string = str(chunk).replace("[", "").replace("]", "")
tables = camelot.read_pdf(filepath, pages=pages_string, strip_text=' ,\n', copy_text=['h'])
tables.export(f"{export_path}/tables.csv")
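# Example usage, a sketch with hypothetical paths (camelot also needs its
# Ghostscript dependency for lattice parsing). Each chunk is parsed and
# exported before the next one starts, keeping memory use bounded on large PDFs:
#   extract_tables("report.pdf", pages_num="all", chunk_num=50, export_path="./out")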
# Read the tables in the PDF and merge each metric with its header, e.g. "2022年1季度营业收入为xxxxx"
def get_pdf_info(file_path, pages):
tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'])
pdf_info = []
tables_range = {}
for table_num, t in enumerate(tables):
top = t._bbox[3]
buttom = t._bbox[1]
page_num = int(t.page)
table_index = int(t.order)
arr = np.array(t.data)
if not tables_range.get(page_num):
tables_range[page_num] = []
tables_range[page_num].append({
'top' : top,
'buttom' : buttom,
'table_index' : table_index,
'page_num' : page_num,
})
pdf_info.append({
'top' : top,
'buttom' : buttom,
'page_num' : page_num,
'table_index' : table_index,
"type" : "table",
"data" : t.data,
'sort_num' : page_num*1000 - top
})
for pagenum, page in enumerate(extract_pages(file_path)):
page_elements = [(element.y1, element) for element in page._objs]
        # Walk the elements that make up the page
for i,component in enumerate(page_elements):
text_type = 'text'
            # Extract the page-layout element
element = component[1]
            # Check whether the element is a text element
if isinstance(element, LTTextBoxHorizontal):
                # Check whether the text appears inside a table
line_text = element.get_text().replace('\n','')
line_text = re.sub(r"\s", "", line_text)
element_top = element.bbox[3]
element_buttom = element.bbox[1]
                # Check whether this text falls inside a table's bounding box
if tables_range.get(pagenum+1):
for range in tables_range[pagenum+1]:
# print(f"{range['top']}: {range['buttom']}: {range['table_index']}")
if element_top < range['top'] and element_top > range['buttom']:
pass
else:
if element_top - range['top'] < 100 and element_top - range['top'] > 5 and not text_in_table(element_top, tables_range, pagenum+1):
if i == 0:
text_type = get_text_type(line_text)
if text_type == 'page_header':
break
if utils.check_table_title_black_list(line_text):
print(line_text)
pdf_info.append({
'top' : element_top,
'buttom' : element_buttom,
'page_num' : range['page_num'],
'table_index' : range['table_index'],
"type" : text_type,
'content' : line_text,
'sort_num' : range['page_num']*1000 - element_top
})
break
                # Handle a parent-company table whose title sits at the bottom of a page while the full table is on the next page
if element_buttom < 150 and not text_in_table(element_top, tables_range, pagenum+1):
text_type = get_text_type(line_text)
if text_type == 'page_footer':
continue
pdf_info.append({
'top' : element_top,
'buttom' : element_buttom,
'page_num' : pagenum+1,
"type" : text_type,
'content' : line_text,
'sort_num' : (pagenum+1)*1000 - element_top
})
# print(f'{element_top}: {element_buttom}: {line_text}')
sorted_pdf_info = sorted(pdf_info, key=lambda k: k['sort_num'])
for info in sorted_pdf_info:
print(info)
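# Example (hypothetical file): prints every table and text element in reading
# order; sort_num = page_num * 1000 - top orders elements top-to-bottom per page.
#   get_pdf_info("report.pdf", pages="all")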
def text_in_table(top, tables_range, page_num):
    # `rng` instead of `range`, to avoid shadowing the builtin
    for rng in tables_range.get(page_num, []):
        if rng['buttom'] < top < rng['top']:
            return True
    return False
def get_text_type(text: str):
first_re = '年度报告'
page_number_pattern = re.compile(r'^\d+(/\d+)?$')
if re.search(first_re, text.strip()):
return 'page_header'
if page_number_pattern.match(text.strip()):
return 'page_footer'
return 'text'
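# Classification examples:
#   get_text_type('2023年年度报告')  ->  'page_header'
#   get_text_type('6/223')           ->  'page_footer'
#   any other text                   ->  'text'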
def find_continuous_numbers(numbers):
    # Sort the list first
numbers.sort()
    # Initialize the output list and the start index of the current run
new_numbers = []
start_index = 0
    # Walk the sorted list
for i in range(1, len(numbers)):
        # Check whether the current number breaks the run
if numbers[i] != numbers[i-1] + 1:
            # The run ended at the previous number, so emit it
if i - start_index > 1:
                # A run longer than one number is joined as "min-max"
new_numbers.append(f"{numbers[start_index]}-{numbers[i-1]}")
else:
                # A run of length one is emitted as a single number
new_numbers.append(str(numbers[start_index]))
if start_index == i - 1:
new_numbers.append(str(numbers[i-1]))
            # Start a new run here
start_index = i
    # Emit the final run at the end of the list
if len(numbers) - start_index > 1:
new_numbers.append(f"{numbers[start_index]}-{numbers[-1]}")
else:
new_numbers.append(str(numbers[start_index]))
if start_index < len(numbers) - 1:
new_numbers.append(str(numbers[-1]))
return new_numbers
def merge_consecutive_arrays(file_path):
merged_objects = []
temp_array = {}
    # Open the file and read it line by line
with open(file_path, 'r') as file:
for line in file:
            # Strip the trailing newline
line = line.strip()
            # Each line is a printed Python dict, not strict JSON, so parse it
            # with ast.literal_eval instead of eval
            try:
                obj = ast.literal_eval(line)
                if obj['type'] == 'table':
                    # Consecutive table objects are merged into one table
                    if not temp_array.get('page_num'):
                        temp_array = obj
                    else:
                        temp_array['data'].extend(obj['data'])
                else:
                    # A non-table object ends the current run of tables
                    if temp_array:
                        # Flush the merged table into the output list
                        merged_objects.append(temp_array)
                        temp_array = {}  # reset the accumulator
            except (ValueError, SyntaxError) as e:
                # literal_eval raises ValueError/SyntaxError on malformed lines
                print(f"Error parsing line: {e}")
    # Flush a trailing run of tables
    if temp_array:
        merged_objects.append(temp_array)
    # (the `with` block closes the file automatically)
return merged_objects
if __name__ == "__main__":
# print(get_text_type('6/223 '.strip()))
# start = time.time()
get_pdf_info('/Users/zhengfei/Desktop/0609/zhangjun-430489-2023-nb-nb_2824bc6302e9442285aed64eed760d99/zhangjun-430489-2023-nb-nb_2824bc6302e9442285aed64eed760d99_1.pdf','all')
# end = time.time()
# print('Task %s runs %0.2f seconds.' % ('223', (end - start)))
    # Sample list
    # numbers = [1, 2, 3, 5, 7, 9, 10, 12, 13, 14, 17, 18, 19, 20, 22, 23, 24, 26, 27, 28, 29, 30, 32, 33, 34, 36, 37, 38, 39]
    # # Call the function and print the result
    # print(find_continuous_numbers(numbers))
    # Sample list of table objects
    # Name the columns / set the index of each table separately, then merge the two tables:
    # df1 = tables[0].df
    # df2 = df1.rename(columns=df1.iloc[0]).drop(df1.index[0]) ## use row 0 as the header
    # df3 = tables[1].df
    # df4 = df3.rename(columns=df3.iloc[0]).drop(df3.index[0])
    # df__2= df2.append(df4,ignore_index=True) ## merge the two frames; ignore_index=True aligns on column names and rebuilds the index
    # print(df__2)
    # Call the function and print the result
    # print(merge_consecutive_arrays('/Users/zhengfei/work/zzb_data/tables.txt'))

View File

@ -0,0 +1,9 @@
{"auto_id": true, "description": "", "fields":
[{"name": "pk", "description": "", "type": 5, "is_primary": true, "auto_id": true},
{"name": "vector", "description": "", "type": 101, "params": {"dim": 1536}},
{"name": "table_num", "description": "", "type": 3},
{"name": "table_index", "description": "", "type": 3},
{"name": "measure_name", "description": "", "type": 21, "params": {"max_length": 304}},
{"name": "measure_value", "description": "", "type": 21, "params": {"max_length": 100}},
{"name": "file_id", "description": "", "type": 21, "params": {"max_length": 50}},
{"name": "measure_unit", "description": "", "type": 21, "params": {"max_length": 50}}]}

33
zzb_data_word/config.py Normal file
View File

@ -0,0 +1,33 @@
MILVUS_CLIENT='http://124.70.129.232:19530'
#MILVUS_CLIENT='http://60.204.228.154:19530'
MYSQL_HOST = '121.37.185.246'
MYSQL_PORT = 3306
MYSQL_USER = 'financial'
MYSQL_PASSWORD = 'financial_8000'
MYSQL_DB = 'financial_report'
# NOTIFY_ADDR = 'http://192.168.0.175:8100/api/tenant/report/notify'
NOTIFY_ADDR = 'http://127.0.0.1:8100/api/tenant/report/notify'
# REDIS_HOST = '127.0.0.1'
REDIS_HOST = '123.60.153.169'
REDIS_PORT = 6379
REDIS_PASSWORD = 'Xgf_redis'
FILE_PATH = '/root/word_parser/word/'
PORT = 8001
MEASURE_COUNT = 8
# MYSQL_HOST_APP = '192.168.0.201'#192.168.0.201
# MYSQL_PORT_APP = 3306
# MYSQL_USER_APP = 'root'
# MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
# MYSQL_DB_APP = 'financial_report_prod'
MYSQL_HOST_APP = '121.37.185.246'#192.168.0.201
MYSQL_PORT_APP = 3306
MYSQL_USER_APP = 'financial'
MYSQL_PASSWORD_APP = 'financial_8000'
MYSQL_DB_APP = 'financial_report'

View File

@ -0,0 +1,260 @@
#coding=utf-8
import sys,ast
# from pdfminer.high_level import extract_text
# from pdfminer.pdfparser import PDFParser
# from pdfminer.pdfdocument import PDFDocument
# from pdfminer.pdfpage import PDFPage
import utils
import mysql.connector
# from pymilvus import connections,MilvusClient
import json,time
# import db_service
import ast
import numpy as np
import config_p
import redis_service
from config_p import MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB,REDIS_HOST,REDIS_PORT,REDIS_PASSWORD
# import main
import redis
def run_job(sec):
time.sleep(sec)
def measure_config_to_db(conn,cursor):
insert_query = '''
INSERT INTO measure_config_half_year
(measure_id, measure_name, ori_measure_id, ori_measure_name,year)
VALUES (%s, %s, %s, %s, %s)
'''
    # Open the text file
    with open('measure_config_all.txt', 'r', encoding='utf-8') as file:
        # Read all lines into a list
        lines = file.readlines()
    # Insert one config row per line
    for line in lines:
config_list = line.strip().split(',')
measure = config_list[0]
ori_measure = config_list[1]
ori_measure_id = utils.get_md5(ori_measure)
data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure, '2024')
cursor.execute(insert_query, data_to_insert)
conn.commit()
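# Expected line format in measure_config_all.txt, one config per line
# (hypothetical example; the columns are measure, ori_measure):
#   2023年营业收入,当期营业收入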
def insert_measure_vector(conn,cursor):
# redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=6)
    # Fetch the measure configs whose embeddings should be cached.
    # NOTE: the first query is immediately overwritten below; only the second
    # one (measure_config, year 2023) is actually executed.
    select_query = '''
    SELECT ori_measure_id,ori_measure_name FROM measure_config_half_year where year='2024'
    '''
    select_query = '''
    SELECT ori_measure_id,ori_measure_name FROM measure_config where year='2023'
    '''
cursor.execute(select_query)
records = cursor.fetchall()
for record in records:
if redis_client.hexists('measure_config', record[0]):
measure_vector = redis_client.hget('measure_config', record[0])
else:
print('新增指标',record[1])
vector_obj = utils.embed_with_str(record[1])
measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])
redis_client.hset('measure_config', record[0], measure_vector)
redis_client.close()
conn.close()
# def contains_financial_indicators(text):
# import re
# # 正则表达式模式匹配千分位格式的数字和百分比
# pattern = r"\d{1,3}(,\d{3})+(\.\d{1,3})?"
# pattern1 = r"\d+(.\d+)+%?"
# # 使用 re.search 函数查找匹配项
# match = re.search(pattern1, text)
# # 如果找到匹配项,返回 True否则返回 False
# return bool(match)
# def get_clean_text(text):
# import re
# pattern = r"\[^)]*?\"
# matches = re.findall(pattern, text)
# for match in matches:
# # 使用 re.findall 函数查找括号内的内容中是否包含月份或关键词
# month_keywords_found = re.search(r"归属于|扣非", match)
# if not month_keywords_found:
# # 如果包含,则从文本中删除该部分
# text = re.sub(pattern,"", text)
# else:
# # 如果不包含,删除所有标点符号和中文数字
# text = re.sub(r"[^\w\s]", "", text)
# print(text)
# def insert_and_update(conn,cursor,client,parent_table_pages,file_id,path):
# # #通过向量查询指标
# db_service.insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_id,path)
# # #指标归一化处理
# db_service.update_ori_measure(conn,cursor,file_id)
# def print_measure_data(cursor,client):
# select_query = '''
# SELECT ori_measure_name,measure_name,ori_measure_id FROM measure_config
# where measure_id not in(select distinct measure_id from ori_measure_list where file_id='64')
# '''
# cursor.execute(select_query)
# records = cursor.fetchall()
# for record in records:
# ori_measure_name = record[0]
# measure_name = record[1]
# ori_measure_id = record[2]
# measure_vector = redis_service.read_from_redis(ori_measure_id)
# measure_list = ast.literal_eval(measure_vector)
# data = [measure_list]
# res = client.search(
# collection_name="pdf_measure_v4", # Replace with the actual name of your collection
# # Replace with your query vector
# data=data,
# limit=2, # Max. number of search results to return
# search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
# output_fields=["measure_name","measure_value","table_num","table_index"],
# filter = 'file_id == "64"'
# )
# vector_str = measure_name+":"+ori_measure_name
# # Convert the output to a formatted JSON string
# for i in range(len(res[0])):
# vector_distance = float(res[0][i]["distance"])
# vector_measure_name = res[0][i]["entity"]["measure_name"]
# measure_value = res[0][i]["entity"]["measure_value"]
# table_num = res[0][i]["entity"]["table_num"]
# table_index = res[0][i]["entity"]["table_index"]
# table_num_list = [106]
# print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index))
# # if vector_distance > 0.89 and table_num not in table_num_list:
# # print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(0.94))
# # if vector_distance > distance and table_num not in table_num_list:
# # print(vector_str +":"+vector_measure_name +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(vector_distance)+":"+str(distance))
if __name__ == "__main__":
# redis_client = redis.Redis(host='123.60.153.169', port=6379, password='Xgf_redis', db=6)
# vector = redis_service.read_from_redis(redis_client,'893301b0e4f1e07d16b4830fcdaea28a')
# print(vector)
conn = mysql.connector.connect(
host=MYSQL_HOST,
user=MYSQL_USER,
password=MYSQL_PASSWORD,
database=MYSQL_DB
)
cursor = conn.cursor()
# measure_config_to_db(conn,cursor)
insert_measure_vector(conn,cursor)
# cursor.close()
# conn.close()
# import re
# text = '减少11.04百分点'
# if re.match(r'(增加|减少)[了]?(\d+\.\d+)[个]?百分点', text):
# print('找到了单位。')
# unit_pattern = re.compile(r'(增加|减少)[了]?(\d+\.\d+)[个]?百分点')
# match = unit_pattern.search(text)
# print(len(match.groups()))
# if match:
# print(f'找到单位。')
# else:
# print(f'没有找到单位。')
# row1 = ['比例','比率','占比','费用']
# row2 = ['同比增减','同比上升','同比下降','变化幅度','变动比例','本期比上年同期增减','本年比上年增减','同比变动','本期期末金额较上期期末变动比例']
# for i in range(len(row1)):
# for j in range(len(row2)):
# print(f"{row1[i]}{row2[j]}")
# import os,re
# file_path = '/projects/ai_chat/knowledge_base/ydkf/content/体育运动处方及应用_13925781.docx'
# # 获取文件名和扩展名
# file_base_name, file_extension = os.path.splitext(os.path.basename(file_path))
# file_base_name = file_base_name.replace("_", "").replace("\d+", "")
# file_base_name = re.sub(r'\d+', '', file_base_name)
# print(f'文件名: {file_base_name}')
# import re
# print(len(re.findall('母公司|现金流量表补充', '补充资料')))
# import threading
# # 创建一个ThreadLocal变量
# local_data = threading.local()
# # 定义一个线程执行的工作函数
# def worker():
# # 为当前线程的ThreadLocal变量设置一个值
# local_data.data = f"Thread {threading.current_thread().name}'s data"
# print(local_data.data)
# # 创建并启动多个线程
# threads = []
# for i in range(3):
# thread = threading.Thread(target=worker)
# thread.start()
# threads.append(thread)
# # 等待所有线程完成
# for thread in threads:
# thread.join()
# for i in range(2,5):
# print(i)
# file_url = 'http://static.cninfo.com.cn/finalpage/2023-04-11/1216368607.PDF'
# file_path = utils.save_pdf_from_url(file_url, config.FILE_PATH)
# redis_client = redis.Redis(host='123.60.153.169', port=6379, password='Xgf_redis', db=6)
# print(redis_client.hget('measure_config', '2805fd5b7bfa960eb08312fa3d7c08'))
# client = MilvusClient(
# uri= MILVUS_CLIENT
# )
# conn = mysql.connector.connect(
# host=MYSQL_HOST,
# user=MYSQL_USER,
# password=MYSQL_PASSWORD,
# database=MYSQL_DB
# )
# cursor = conn.cursor()
# print_measure_data(cursor,client)
# redis_service.read_from_file_and_write_to_redis(conn,cursor)vim
# redis_service.read_from_redis()
# parent_table_pages = []
# file_id = '67'
# path = '/Users/zhengfei/Desktop/上汽车配/上汽车配_1.pdf'
# db_service.insert_table_measure_from_vector_test(conn,cursor,client,parent_table_pages,file_id,path)
# db_service.update_ori_measure(conn,cursor,file_id)
# main.get_table_measure(path,'all',file_id)
# insert_and_update(conn,cursor,client,parent_table_pages,file_id,path)
# measure_config_to_db(conn,cursor)
# params = ['f_102','f_103',]
# for param in params:
# globals()[param] = param.replace('f_','')
# # insert_measure_vector(conn,cursor)
# print(globals()['f_102'])
# db_service.update_ori_measure(conn,cursor,file_id)
# conn.commit()
# cursor.close()
# conn.close()
# # print(utils.get_md5('当期营业收入,2023年营业收入'))
# count_range_parts = utils.get_range(2300)
# print(count_range_parts)

33
zzb_data_word/config_p.py Normal file
View File

@ -0,0 +1,33 @@
MILVUS_CLIENT='http://127.0.0.1:19530'
#MILVUS_CLIENT='http://60.204.228.154:19530'
# MYSQL_HOST = '121.37.185.246'
# MYSQL_PORT = 3306
# MYSQL_USER = 'financial'
# MYSQL_PASSWORD = 'financial_8000'
# MYSQL_DB = 'financial_report'
NOTIFY_ADDR = 'http://192.168.0.166:8100/api/tenant/report/notify'
# NOTIFY_ADDR_ID = 'http://192.168.0.175:8100/api/tenant/info/notify'
# NOTIFY_ADDR = 'http://127.0.0.1:8100/api/tenant/report/notify'
REDIS_HOST = '192.168.0.172'
# REDIS_HOST = '123.60.153.169'
REDIS_PORT = 6379
REDIS_PASSWORD = 'Xgf_redis'
FILE_PATH = '/root/pdf_parser/word/'
PORT = 8001
MEASURE_COUNT = 8
MYSQL_HOST = '192.168.0.142'#192.168.0.201
MYSQL_PORT = 3306
MYSQL_USER = 'financial_prod'
MYSQL_PASSWORD = 'mmTFncqmDal5HLRGY0BV'
MYSQL_DB = 'financial_report_prod'
MYSQL_HOST_APP = '192.168.0.142'#192.168.0.201
MYSQL_PORT_APP = 3306
MYSQL_USER_APP = 'financial_prod'
MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
MYSQL_DB_APP = 'financial_report_prod'

View File

@ -0,0 +1,15 @@
MILVUS_CLIENT='http://127.0.0.1:19530'
MILVUS_HOST = '127.0.0.1'
MILVUS_PORT = 19530
MYSQL_HOST = '75e59185a2624316882c98206dbe4c49in01.internal.cn-east-3.mysql.rds.myhuaweicloud.com'
MYSQL_PORT = 3306
MYSQL_USER = 'financial_prod'
MYSQL_PASSWORD = 'mmTFncqmDal5HLRGY0BV'
MYSQL_DB = 'financial_report_prod'
NOTIFY_ADDR = 'http://192.168.0.166:8100/api/tenant/report/notify'
FILE_PATH = '/root/pdf_parser/pdf/'
REDIS_HOST = '192.168.0.172'
REDIS_PORT = 6379
REDIS_PASSWORD = 'Xgf_redis'
PORT = 8000
MEASURE_COUNT = 8

1015
zzb_data_word/db_service.py Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

119
zzb_data_word/excel.py Normal file
View File

@ -0,0 +1,119 @@
import pandas as pd
import json
import utils
from config_p import MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB
import mysql.connector
# Read the Excel file
df = pd.read_excel('/Users/zhengfei/Desktop/cb/ttt.xlsx', header=0)
# Convert the DataFrame to a list of dicts
data_list = df.to_dict(orient='records')
year = 2023
conn = mysql.connector.connect(
host = MYSQL_HOST,
user = MYSQL_USER,
password = MYSQL_PASSWORD,
database = MYSQL_DB
)
# Create a cursor to execute SQL statements
cursor = conn.cursor()
insert_query = '''
INSERT INTO measure_create_config
(config_id, meta_measure, same_mean_measure, measure_period, change_type, black_list)
VALUES (%s, %s, %s, %s, %s, %s)
'''
for data in data_list:
show_measure = str(data['指标'])
same_mean_measure = str(data['同义表述'])
period_measure = str(data['周期'])
change_measure = str(data['变动'])
black_list = str(data['黑名单词'])
config_id = utils.get_md5(show_measure)
insert_query_data = (config_id, show_measure, same_mean_measure, period_measure, change_measure, black_list)
cursor.execute(insert_query, insert_query_data)
conn.commit()
# Read the Excel file
df_period = pd.read_excel('/Users/zhengfei/Desktop/cb/period.xlsx', header=0)
# Convert the DataFrame to a list of dicts
period_list = df_period.to_dict(orient='records')
period_insert_query = '''
INSERT INTO measure_create_period
(period_name, same_mean_period)
VALUES (%s, %s)
'''
for data in period_list:
period_name = str(data['标准表述'])
same_mean_period = str(data['同义表述'])
insert_query_data = (period_name, same_mean_period)
cursor.execute(period_insert_query, insert_query_data)
conn.commit()
data_query = '''
SELECT * FROM measure_create_config where delete_status = 0
'''
period_query = '''
SELECT * FROM measure_create_period
'''
cursor.execute(data_query)
data_list = cursor.fetchall()
cursor.execute(period_query)
period_list = cursor.fetchall()
for data in data_list:
config_id = data[0]
show_measure = data[1]
same_mean_measure = data[2]
period_measure = data[3]
change_measure = data[4]
same_mean_measure_arr = []
period_measure_arr = []
change_measure_arr = []
if same_mean_measure != 'nan' :
same_mean_measure_arr = same_mean_measure.split(',')
same_mean_measure_arr.append(show_measure)
if period_measure != 'nan' :
period_measure_arr = period_measure.split(',')
if change_measure != 'nan' :
change_measure_arr = change_measure.split(',')
for c in change_measure_arr:
period_measure_arr.append(c)
for x in period_measure_arr:
if x in change_measure_arr:
show_name = show_measure+x
else:
show_name = x+show_measure
        for y in same_mean_measure_arr:
            # Membership is tested against the list of change expressions rather
            # than the raw comma-separated string (substring matching is too loose)
            if x in change_measure_arr:
                parser_name = y + x
            else:
                parser_name = x + y
            print(f'{show_name},{parser_name}')
            for p in period_list:
                period_exra_name = p[0]
                period_exra_value = p[1]
                if x.find(period_exra_name) != -1:
                    for v in period_exra_value.split(','):
                        if x in change_measure_arr:
                            parser_name = y + x.replace(period_exra_name, v)
                        else:
                            parser_name = x.replace(period_exra_name, v) + y
                        print(f'{show_name},{parser_name}')
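# Worked example (hypothetical config row): show_measure='营业收入',
# same_mean_measure='营业总收入', period_measure='2023年', change_measure='同比增减'.
# period_measure_arr becomes ['2023年', '同比增减']. For x='2023年' the inner loop
# prints '2023年营业收入,2023年营业总收入' and '2023年营业收入,2023年营业收入';
# for the change expression x='同比增减' the metric name comes first:
# '营业收入同比增减,营业总收入同比增减' and '营业收入同比增减,营业收入同比增减'.
# The period_list pass then rewrites period words (e.g. '2023年') with each
# synonym from measure_create_period and prints those variants as well.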
cursor.close()
conn.close()

View File

@ -0,0 +1,72 @@
import pandas as pd
import json
import utils
from config_p import MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB
import mysql.connector
def getId(name):
    # Map a category name to its numeric id; unknown names fall into bucket 6
    categorys = [
        {"name": "术后康复", "id": 1},
        {"name": "运动损伤康复", "id": 2},
        {"name": "慢病康复", "id": 3},
        {"name": "运动训练", "id": 4},
        {"name": "健康科普", "id": 5},
    ]
    object_dict = {obj["name"]: obj["id"] for obj in categorys}
    return object_dict.get(name, 6)
# Read the Excel file
df = pd.read_excel('/Users/zhengfei/Desktop/book.xlsx', header=0)
# Convert the DataFrame to a list of dicts
data_list = df.to_dict(orient='records')
conn = mysql.connector.connect(
host = 'rm-bp1vns6jjy6yu46lhio.mysql.rds.aliyuncs.com',
user = 'hematiyu',
password = '00a09f971769499f8c0495505ab0922C',
database = 'km'
)
# Create a cursor to execute SQL statements
cursor = conn.cursor()
for data in data_list:
print(data)
book_name = str(data['书名']).replace('\n', '')
category = str(data['分类'])
category_name = category.split(',')[0]
category = getId(category_name)
keywords = str(data['关键词'])
if keywords == 'nan':
keywords = ''
    # Parameterized UPDATE: spreadsheet values must not be interpolated into SQL
    update_query = '''
    update km_doc set category = %s, keywords = %s, source = 1 where title = %s
    '''
    print(update_query)
    cursor.execute(update_query, (category, keywords, book_name))
conn.commit()
cursor.close()
conn.close()

22
zzb_data_word/get_pdf.py Normal file
View File

@ -0,0 +1,22 @@
import os
import shutil
# Stock codes to match against file names
numbers = ['837242','830839','837212','830832','430510','835670','837092','831689','832419','831278','838171','834261','430476','831195','872190','833394','872953','831304','832651','873132','832651','600060','600076','600180','600188','600212','600219','600223','600229','600308','600309','600319','600336','600350','600426','600448','600467','600529','600547','600579','600586','600587','600600','600690','600727','600735','600756','600760','600777','600783','600784','600789','600804','600807','600858','600898','600960','600966','600022','600027','600017','601678','601058','601028','603167','603798','603779','603421','603612','603021','601366','603367','601966','603029','603639','603026','603858','603223','601163','603708','603577','603086','603638','603217','603536','603113','603586','603856','601019','600918','603967','605006','603278','603279','603755','603739','601298','603187','605198','688002','605001','605100','601665','603102','688579','688309','605287','605016','688556','605589','688677','688191','688663','688681','600955','603836','688087','605567','603182','603190','603151','601096','688695','603270','603285','688002','688139','688363','688021','688579','688309','688556','688557','688677','688136','688191','688161','688501','688663','688681','688087','688190','688234','688331','688455','688035','688695','000407','000409','000423','000488','000498','000503','000506','000554','000599','000639','000655','000668','000677','000680','000682','000720','000726','000756','000811','000822','000830','000869','000880','000915','000951','000957','000977','002026','002041','002073','002078','002083','002086','002088','002094','002107','002111','002117','000338','002254','002193','002237','002234','002242','002241','002248','002270','002283','002286','002490','300001','002323','002330','002339','002355','002353','002374','002376','002363','002379','002382','002408','300105','002469','002458','300099','300321','300110','002476','002470','002595','002481','300121','002485','002521','002498','002643','002805','300143','002526','002537','300175','002545','300183','300185','300214','002581','300208','002580','300224','002589','300233','300237','002588','002598','300243','300285','002655','002675','300308','002671','300343','002746','002838','002726','002768','300443','300479','002793','300423','002810','300569','300659','300583','002871','300996','300699','300801','300653','300690','002890','300677','002891','003033','002921','002899','002958','002948','301017','300786','300848','301035','300654','300594','300779','301015','002984','301020','300830','300821','300840','003022','301199','301299','300918','300950','300948','300993','003042','001207','301022','001219','301069','301185','301149','301188','301296','301158','301439','301206','301262','301209','301320','301281','301337','301456','001260','001300','001331','301292','301498','001379']
# Source and target directories
source_dir = '/Users/zhengfei/Desktop/cb'
target_dir = '/Users/zhengfei/Desktop/sandong'
# Walk the source directory
for root, dirs, files in os.walk(source_dir):
    for file in files:
        # Copy the file if its name contains any of the listed codes
        if any(str(number) in file for number in numbers):
            # Full path of the source file
            file_path = os.path.join(root, file)
            # Full path inside the target directory
            target_path = os.path.join(target_dir, file)
            # Copy the file, preserving metadata
            shutil.copy2(file_path, target_path)
            print(f"文件 {file_path} 已拷贝到 {target_dir}")

3713
zzb_data_word/half_year.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,99 @@
#coding=utf-8
import random
from http import HTTPStatus
from dashscope import Generation
from datetime import datetime
# Send text and table data to the LLM and return the list of raw measures it extracts
def get_measure_from_llm(user_prompt):
"""
    :return: the list of raw measures the LLM extracts from the given report text
"""
llm_measure_list = []
system_prompt = '''
你是一个优秀的金融分析师从给定的数据报告中自动提取以下关键财务指标指标包括
2023年营业收入
2022年营业收入
2021年营业收入
2023年第一季度营业收入
2023年第二季度营业收入
2023年第三季度营业收入
2023年第四季度营业收入
营业收入同比变动
2023年归母净利润
2022年归母净利润
2021年归母净利润
2023年第一季度归母净利润
2023年第二季度归母净利润
2023年第三季度归母净利润
2023年第四季度归母净利润
归母净利润同比变动
2023年扣非净利润
2022年扣非净利润
2021年扣非净利润
2023年第一季度扣非净利润
2023年第二季度扣非净利润
2023年第三季度扣非净利润
2023年第四季度扣非净利润
扣非净利润同比变动
2023年经营活动现金流净额
2022年经营活动现金流净额
2021年经营活动现金流净额
经营活动现金流净额同比变动
2023年筹资活动现金流净额
2022年筹资活动现金流净额
2021年筹资活动现金流净额
2023年投资活动现金流净额
2022年投资活动现金流净额
2021年投资活动现金流净额
2023年非经常性损益
2022年非经常性损益
2021年非经常性损益
2023年基本每股收益
2022年基本每股收益
2021年基本每股收益
2023年稀释每股收益
2022年稀释每股收益
2021年稀释每股收益
2023年加权平均净资产收益率
2022年加权平均净资产收益率
2021年加权平均净资产收益率
2023年扣非加权平均净资产收益率
2022年扣非加权平均净资产收益率
2021年扣非加权平均净资产收益率
请确保只抽取这些指标并且每个指标的输出格式为指标名:指标值,只需要按格式输出不要增加其他内容所有的指标值必须从用户提供的信息中抽取不允许自己生成如果找不到相关指标指标值显示为-
<数据报告>
<user_prompt>
</数据报告>
'''
system_prompt = system_prompt.replace('<user_prompt>', user_prompt)
response = Generation.call(
model='qwen-plus',
prompt = system_prompt,
seed=random.randint(1, 10000),
top_p=0.8,
result_format='message',
enable_search=False,
max_tokens=1500,
temperature=0.85,
repetition_penalty=1.0
)
if response.status_code == HTTPStatus.OK:
result = response['output']['choices'][0]['message']['content']
llm_measure_list = result.split('\n')
return llm_measure_list
else:
print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
response.request_id, response.status_code,
response.code, response.message
))
return "llm_error"
if __name__ == '__main__':
user_prompt = '''
经营情况回顾 () 经营计划 2023 在国际环境复杂多变以及全球经济依旧下行的形势下公司严格按照既定发展战略和经营计划狠抓落实迎难而上业务经营整体保持稳定如期完成全年既定经营目标在全体职员的共同努力下公司的营业收入净利润等各项指标再创历史新高营业收入较上年同期实现15.43%的增长归属于上市公司股东的净利润较上年同期实现 26.47%的增长 1财务状况 报告期末公司资产总额为 1,473,271,310.23 增幅为 19.17%主要系一方面随着销售规模的不断增长公司应收账款及合同资产等流动资产增幅较大另一方面为解决基于销售规模扩大引致的产能跟不上的瓶颈公司上马扩产建设项目导致在建工程固定资产等非流动资产增幅较报告期末公司负债总额为 800,619,067.70 增幅为 26.12%主要系随着销售规模增加工程建设项目推进固定资产购置等公司采购数额大幅增加公司通过银行借款等方式筹集资金导致长短期贷款期末余额增幅较大 报告期末归属于上市公司股东的净资产为 670,316,339.35 增幅为 11.45%主要系报告期内经营积累 2经营成果 报告期内公司实现营业收入 1,003,535,799.51 增幅为 15.43%主要系公司本期持续优化生产经营大力推进产品研发和创新抓住双碳政策以及能效提升产生的市场需求旺盛的有利时机且随着公司北交所上市产品品牌效应凸显产能增加订单获取能力增强变压器及户外成套设备销售增长较多 营业成本为 810,779,075.89 增幅为 15.33%主要系报告期内销售增长及主要原材料价格变动所致归属于上市公司股东的净利润为 73,033,633.31 增幅为 26.47%主要系1公司持续优化生产经营大力推进产品研发和创新抓住双碳政策以及能效提升产生的市场需求旺盛的有利时机生产和销售均呈稳定增长2本期处置开源路 1-1 号土地及建筑物及其他附属物等结转资产处置收益同比增加
'''
measure_list = get_measure_from_llm(user_prompt)
print(measure_list)

View File

@ -0,0 +1,204 @@
2024-12-29 16:13:29,975|zzb_logger : INFO 开始启动文件解析任务: 1.docx
2024-12-29 16:13:36,106|zzb_logger : INFO 任务 201917 完成
2024-12-29 16:15:16,205|zzb_logger : INFO 开始启动文件解析任务: 1.docx
2024-12-29 16:15:22,356|zzb_logger : INFO 任务 201917 完成
2024-12-29 16:17:15,693|zzb_logger : INFO 开始启动文件解析任务: 1.docx
2024-12-29 16:17:15,696|zzb_logger : INFO 通知pdf开始解析url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=5
2024-12-29 16:17:15,696|zzb_logger : INFO 通知pdf开始解析状态:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<title>Error response</title>
</head>
<body>
<h1>Error response</h1>
<p>Error code: 404</p>
<p>Message: File not found.</p>
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
</body>
</html>
2024-12-29 16:17:25,319|zzb_logger : INFO text任务ID:201917
2024-12-29 16:17:26,701|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (5116)...
2024-12-29 16:17:28,173|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (22268)...
2024-12-29 16:17:29,591|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (27736)...
2024-12-29 16:17:30,937|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (38276)...
2024-12-29 16:17:32,294|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (38292)...
2024-12-29 16:17:33,664|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (38240)...
2024-12-29 16:17:35,153|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (28536)...
2024-12-29 16:17:36,559|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (37552)...
2024-12-29 16:17:37,929|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (37856)...
2024-12-29 16:17:39,291|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (10528)...
2024-12-29 16:17:40,688|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (31444)...
2024-12-29 16:17:42,133|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (11108)...
2024-12-29 16:17:43,518|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (23236)...
2024-12-29 16:17:44,901|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (23572)...
2024-12-29 16:17:46,495|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (39604)...
2024-12-29 16:17:47,899|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (4076)...
2024-12-29 16:17:47,899|zzb_logger : INFO 等待所有子任务完成任务ID:201917
2024-12-29 16:18:02,194|zzb_logger : INFO word表格中 text解析完成任务ID:201917
2024-12-29 16:18:02,196|zzb_logger : INFO 开始解析word表表格中的table任务ID:201917
2024-12-29 16:18:03,525|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36176)...
2024-12-29 16:18:04,585|zzb_logger : INFO Task 解析表格201917 runs 1.06 seconds.
2024-12-29 16:18:04,873|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (35368)...
2024-12-29 16:18:05,769|zzb_logger : INFO Task 解析表格201917 runs 0.90 seconds.
2024-12-29 16:18:06,263|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (33004)...
2024-12-29 16:18:07,225|zzb_logger : INFO Task 解析表格201917 runs 0.96 seconds.
2024-12-29 16:18:07,628|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (30764)...
2024-12-29 16:18:08,427|zzb_logger : INFO Task 解析表格201917 runs 0.80 seconds.
2024-12-29 16:18:08,976|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (29608)...
2024-12-29 16:18:09,864|zzb_logger : INFO Task 解析表格201917 runs 0.89 seconds.
2024-12-29 16:18:10,588|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (5404)...
2024-12-29 16:18:11,360|zzb_logger : INFO Task 解析表格201917 runs 0.77 seconds.
2024-12-29 16:18:11,966|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36200)...
2024-12-29 16:18:12,030|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36328)...
2024-12-29 16:18:12,892|zzb_logger : INFO Task 解析表格201917 runs 0.93 seconds.
2024-12-29 16:18:13,034|zzb_logger : INFO Task 解析表格201917 runs 1.00 seconds.
2024-12-29 16:18:13,392|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (39712)...
2024-12-29 16:18:14,166|zzb_logger : INFO Task 解析表格201917 runs 0.77 seconds.
2024-12-29 16:18:15,030|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (17184)...
2024-12-29 16:18:15,084|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (38828)...
2024-12-29 16:18:15,156|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (39596)...
2024-12-29 16:18:15,194|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36908)...
2024-12-29 16:18:15,268|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (38088)...
2024-12-29 16:18:15,273|zzb_logger : INFO 解析表格时出现了异常 setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (8,) + inhomogeneous part. 内容为{'type': 'table', 'index': 1438, 'data': [['项目', '期末', '期末', '期末', '期末', '期末', '期初', '期初', '期初', '期初', '期初', '期初', '期初', '期初'], ['', '账面余额', '账面价值', '受限类型', '受限情况', '受限情况', '账面余额', '账面余额', '账面价值', '账面价值', '受限类型', '受限类型', '受限情况', ''], ['货币资金', '485,532.72', '485,532.72', '', '住房专用基金', '住房专用基金', '482,151.75', '482,151.75', '482,151.75', '482,151.75', '', '', '住房专用基金', ''], ['固定资产', '9,798,299.46', '9,798,299.46', '', '金融机构借款抵押', '3,747,470.09', '3,747,470.09', '3,747,470.09', '3,747,470.09', '', '', '金融机构借款抵押', '金融机构借款抵押'], ['无形资产', '7,982,261.87', '7,982,261.87', '', '金融机构借款抵押', '5,437,462.92', '5,437,462.92', '5,437,462.92', '5,437,462.92', '', '', '金融机构借款抵押', '金融机构借款抵押'], ['货币资金', '43,997,452.57', '43,997,452.57', '', '银行保证金', '63,388,483.00', '63,388,483.00', '63,388,483.00', '63,388,483.00', '', '', '银行保证金', '银行保证金'], ['投资性房地产', '62,041,831.52', '62,041,831.52', '', '金融机构借款抵押', '67,653,392.10', '67,653,392.10', '67,653,392.10', '67,653,392.10', '', '', '金融机构借款抵押', '金融机构借款抵押'], ['合计', '124,305,378.14', '124,305,378.14', '', '', '140,708,959.86', '140,708,959.86', '140,708,959.86', '140,708,959.86', '', '', '', '']]}
2024-12-29 16:18:15,722|zzb_logger : INFO Task 解析表格201917 runs 0.69 seconds.
2024-12-29 16:18:15,873|zzb_logger : INFO Task 解析表格201917 runs 0.79 seconds.
2024-12-29 16:18:16,067|zzb_logger : INFO Task 解析表格201917 runs 0.91 seconds.
2024-12-29 16:18:16,086|zzb_logger : INFO Task 解析表格201917 runs 0.89 seconds.
2024-12-29 16:18:16,158|zzb_logger : INFO Task 解析表格201917 runs 0.89 seconds.
2024-12-29 16:18:16,787|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (39052)...
2024-12-29 16:18:16,847|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (35928)...
2024-12-29 16:18:17,456|zzb_logger : INFO Task 解析表格201917 runs 0.61 seconds.
2024-12-29 16:18:17,644|zzb_logger : INFO Task 解析表格201917 runs 0.86 seconds.
2024-12-29 16:18:17,819|zzb_logger : INFO word表格中 table解析完成任务ID:201917
2024-12-29 16:18:17,985|zzb_logger : INFO 解析任务 201917 完成耗时62.29 秒。
2024-12-29 16:18:18,106|zzb_logger : INFO 通知开始抽取指标url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=6
2024-12-29 16:18:18,106|zzb_logger : INFO 通知开始抽取指标状态:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<title>Error response</title>
</head>
<body>
<h1>Error response</h1>
<p>Error code: 404</p>
<p>Message: File not found.</p>
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
</body>
</html>
2024-12-29 16:18:18,107|zzb_logger : INFO 开始表格指标抽取任务ID:201917
2024-12-29 16:18:20,187|zzb_logger : INFO 提取指标任务 0-10 (29656)...
2024-12-29 16:18:21,575|zzb_logger : INFO 提取指标任务 10-20 (38952)...
2024-12-29 16:18:22,849|zzb_logger : INFO 提取指标任务 20-30 (31900)...
2024-12-29 16:18:24,192|zzb_logger : INFO 提取指标任务 30-40 (30420)...
2024-12-29 16:18:25,554|zzb_logger : INFO 提取指标任务 40-50 (32448)...
2024-12-29 16:18:26,909|zzb_logger : INFO 提取指标任务 50-60 (37708)...
2024-12-29 16:18:28,305|zzb_logger : INFO 提取指标任务 60-70 (36136)...
2024-12-29 16:18:28,933|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,933|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,933|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,936|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:29,637|zzb_logger : INFO 提取指标任务 70-80 (39120)...
2024-12-29 16:18:42,814|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:46,511|zzb_logger : INFO 提取指标 40-50 runs 20.96 seconds.
2024-12-29 16:18:54,027|zzb_logger : INFO 提取指标 70-80 runs 24.39 seconds.
2024-12-29 16:19:17,236|zzb_logger : INFO 提取指标 60-70 runs 48.93 seconds.
2024-12-29 16:19:20,151|zzb_logger : INFO 提取指标 30-40 runs 55.96 seconds.
2024-12-29 16:19:40,383|zzb_logger : INFO 提取指标 50-60 runs 73.47 seconds.
2024-12-29 16:20:06,573|zzb_logger : INFO 提取指标 0-10 runs 106.39 seconds.
2024-12-29 16:20:44,937|zzb_logger : INFO 提取指标 10-20 runs 143.36 seconds.
2024-12-29 16:20:50,959|zzb_logger : INFO 提取指标 20-30 runs 148.11 seconds.
2024-12-29 16:20:51,337|zzb_logger : INFO 表格指标抽取完成任务ID:201917
2024-12-29 16:20:51,337|zzb_logger : INFO 表格指标抽取 201917 完成耗时153.23 秒。
2024-12-29 16:20:51,337|zzb_logger : INFO 启动这个指标归一化任务ID-修改测试:201917
2024-12-29 16:20:51,549|zzb_logger : INFO 目录黑名单为:[]
2024-12-29 16:20:52,316|zzb_logger : INFO 向量配置数据查询 0.11 秒。
2024-12-29 16:20:52,317|zzb_logger : INFO insert_table_measure_from_vector_async_process方法走的半年报
2024-12-29 16:20:54,191|zzb_logger : INFO Run task 0-351 (41216)...
2024-12-29 16:20:54,192|zzb_logger : INFO 插入数据 2815
2024-12-29 16:20:54,742|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:20:55,664|zzb_logger : INFO Run task 351-702 (16388)...
2024-12-29 16:20:55,664|zzb_logger : INFO 插入数据 2815
2024-12-29 16:20:56,152|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:20:57,120|zzb_logger : INFO Run task 702-1053 (41796)...
2024-12-29 16:20:57,120|zzb_logger : INFO 插入数据 2815
2024-12-29 16:20:57,611|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:20:58,818|zzb_logger : INFO Run task 1053-1404 (39320)...
2024-12-29 16:20:58,818|zzb_logger : INFO 插入数据 2815
2024-12-29 16:20:59,324|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:21:00,159|zzb_logger : INFO Run task 1404-1755 (41868)...
2024-12-29 16:21:00,159|zzb_logger : INFO 插入数据 2815
2024-12-29 16:21:00,887|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:21:01,473|zzb_logger : INFO Run task 1755-2106 (26816)...
2024-12-29 16:21:01,473|zzb_logger : INFO 插入数据 2815
2024-12-29 16:21:02,171|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:21:02,832|zzb_logger : INFO Run task 2106-2457 (32120)...
2024-12-29 16:21:02,832|zzb_logger : INFO 插入数据 2815
2024-12-29 16:21:03,703|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:21:04,179|zzb_logger : INFO 等待所有子任务完成任务ID:201917
2024-12-29 16:21:04,179|zzb_logger : INFO Run task 2457-2815 (38332)...
2024-12-29 16:21:04,179|zzb_logger : INFO 插入数据 2815
2024-12-29 16:21:04,886|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:23:00,285|zzb_logger : INFO 所有子任务完成任务ID:201917
2024-12-29 16:23:00,286|zzb_logger : INFO 启动指标归一化任务ID:201917
2024-12-29 16:23:00,286|zzb_logger : INFO 向量更新时间 127.97 秒。
2024-12-29 16:23:00,474|zzb_logger : INFO 更新数据查询 0.17 秒。
2024-12-29 16:23:00,474|zzb_logger : INFO update_ori_measure方法走的是半年报
2024-12-29 16:23:00,474|zzb_logger : INFO 更新数据更新 0.00 秒。
2024-12-29 16:23:00,522|zzb_logger : INFO 更新数据写入 0.05 秒。
2024-12-29 16:23:00,522|zzb_logger : INFO 归一化完成任务ID:201917
2024-12-29 16:23:00,522|zzb_logger : INFO 任务 201917 完成耗时344.83 秒。
2024-12-29 16:23:00,669|zzb_logger : INFO 通知任务状态url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=1
2024-12-29 16:23:00,669|zzb_logger : INFO 通知任务状态任务:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<title>Error response</title>
</head>
<body>
<h1>Error response</h1>
<p>Error code: 404</p>
<p>Message: File not found.</p>
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
</body>
</html>
2024-12-29 16:23:00,821|zzb_logger : INFO 任务 201917 完成

File diff suppressed because it is too large

View File

@@ -0,0 +1,162 @@
2024-11-25 15:33:22,588|zzb_logger : INFO 开始启动文件解析任务: 103.docx
2024-11-25 15:33:22,593|zzb_logger : INFO 通知pdf开始解析url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=5
2024-11-25 15:33:22,593|zzb_logger : INFO 通知pdf开始解析状态:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<title>Error response</title>
</head>
<body>
<h1>Error response</h1>
<p>Error code: 404</p>
<p>Message: File not found.</p>
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
</body>
</html>
2024-11-25 15:33:28,433|zzb_logger : INFO text任务ID:201917
2024-11-25 15:33:29,616|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (14328)...
2024-11-25 15:33:31,068|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (28108)...
2024-11-25 15:33:32,200|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (19476)...
2024-11-25 15:33:33,366|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (17332)...
2024-11-25 15:33:34,692|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (23168)...
2024-11-25 15:33:35,803|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (26276)...
2024-11-25 15:33:36,919|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (20716)...
2024-11-25 15:33:38,051|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (1760)...
2024-11-25 15:33:39,160|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (13296)...
2024-11-25 15:33:40,302|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (8592)...
2024-11-25 15:33:41,406|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (20664)...
2024-11-25 15:33:42,511|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (21840)...
2024-11-25 15:33:43,619|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (19108)...
2024-11-25 15:33:44,744|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (29096)...
2024-11-25 15:33:45,854|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (17024)...
2024-11-25 15:33:47,001|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (18668)...
2024-11-25 15:33:47,001|zzb_logger : INFO 等待所有子任务完成任务ID:201917
2024-11-25 15:34:03,934|zzb_logger : INFO word表格中 text解析完成任务ID:201917
2024-11-25 15:34:03,936|zzb_logger : INFO 开始解析word表表格中的table任务ID:201917
2024-11-25 15:34:05,071|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (7472)...
2024-11-25 15:34:06,182|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (19500)...
2024-11-25 15:34:06,445|zzb_logger : INFO Task 解析表格201917 runs 1.37 seconds.
2024-11-25 15:34:07,083|zzb_logger : INFO 等待所有子任务完成任务ID:201917
2024-11-25 15:34:07,641|zzb_logger : INFO Task 解析表格201917 runs 1.46 seconds.
2024-11-25 15:34:08,265|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (20888)...
2024-11-25 15:34:08,386|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (28568)...
2024-11-25 15:34:08,464|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (26716)...
2024-11-25 15:34:08,592|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (27376)...
2024-11-25 15:34:08,663|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (12360)...
2024-11-25 15:34:08,791|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (28692)...
2024-11-25 15:34:08,797|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (11684)...
2024-11-25 15:34:08,892|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (21064)...
2024-11-25 15:34:08,948|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (24608)...
2024-11-25 15:34:08,994|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (8632)...
2024-11-25 15:34:09,098|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (23436)...
2024-11-25 15:34:09,138|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (15992)...
2024-11-25 15:34:09,176|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (9844)...
2024-11-25 15:34:09,219|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (17936)...
2024-11-25 15:34:09,298|zzb_logger : INFO Task 解析表格201917 runs 0.91 seconds.
2024-11-25 15:34:09,399|zzb_logger : INFO Task 解析表格201917 runs 1.13 seconds.
2024-11-25 15:34:09,428|zzb_logger : INFO Task 解析表格201917 runs 0.96 seconds.
2024-11-25 15:34:09,565|zzb_logger : INFO Task 解析表格201917 runs 0.97 seconds.
2024-11-25 15:34:09,637|zzb_logger : INFO Task 解析表格201917 runs 0.84 seconds.
2024-11-25 15:34:09,963|zzb_logger : INFO Task 解析表格201917 runs 1.01 seconds.
2024-11-25 15:34:10,020|zzb_logger : INFO Task 解析表格201917 runs 1.23 seconds.
2024-11-25 15:34:10,036|zzb_logger : INFO Task 解析表格201917 runs 1.37 seconds.
2024-11-25 15:34:10,073|zzb_logger : INFO Task 解析表格201917 runs 0.93 seconds.
2024-11-25 15:34:10,168|zzb_logger : INFO Task 解析表格201917 runs 1.28 seconds.
2024-11-25 15:34:10,223|zzb_logger : INFO Task 解析表格201917 runs 1.12 seconds.
2024-11-25 15:34:10,265|zzb_logger : INFO Task 解析表格201917 runs 1.27 seconds.
2024-11-25 15:34:10,304|zzb_logger : INFO Task 解析表格201917 runs 1.13 seconds.
2024-11-25 15:34:10,404|zzb_logger : INFO Task 解析表格201917 runs 1.18 seconds.
2024-11-25 15:34:10,557|zzb_logger : INFO word表格中 table解析完成任务ID:201917
2024-11-25 15:34:10,728|zzb_logger : INFO 解析任务 201917 完成耗时48.14 秒。
2024-11-25 15:34:10,879|zzb_logger : INFO 通知开始抽取指标url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=6
2024-11-25 15:34:10,879|zzb_logger : INFO 通知开始抽取指标状态:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<title>Error response</title>
</head>
<body>
<h1>Error response</h1>
<p>Error code: 404</p>
<p>Message: File not found.</p>
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
</body>
</html>
2024-11-25 15:34:10,879|zzb_logger : INFO 开始表格指标抽取任务ID:201917
2024-11-25 15:34:12,902|zzb_logger : INFO 提取指标任务 0-8 (20908)...
2024-11-25 15:34:13,964|zzb_logger : INFO 提取指标任务 8-16 (23592)...
2024-11-25 15:34:15,047|zzb_logger : INFO 提取指标任务 16-24 (12664)...
2024-11-25 15:34:16,203|zzb_logger : INFO 提取指标任务 24-32 (29872)...
2024-11-25 15:34:17,576|zzb_logger : INFO 提取指标任务 32-40 (28748)...
2024-11-25 15:34:18,385|zzb_logger : INFO 提取指标任务 40-48 (2204)...
2024-11-25 15:34:19,517|zzb_logger : INFO 提取指标任务 48-56 (22344)...
2024-11-25 15:34:20,613|zzb_logger : INFO 提取指标任务 56-66 (18352)...
2024-11-25 15:34:26,136|zzb_logger : INFO 提取指标 48-56 runs 6.62 seconds.
2024-11-25 15:34:36,392|zzb_logger : INFO 提取指标 24-32 runs 20.19 seconds.
2024-11-25 15:34:43,329|zzb_logger : INFO 提取指标 56-66 runs 22.72 seconds.
2024-11-25 15:34:47,575|zzb_logger : INFO 提取指标 40-48 runs 29.19 seconds.
2024-11-25 15:34:56,075|zzb_logger : INFO 提取指标 16-24 runs 41.03 seconds.
2024-11-25 15:34:59,737|zzb_logger : INFO 提取指标 32-40 runs 42.16 seconds.
2024-11-25 15:35:26,785|zzb_logger : INFO 提取指标 0-8 runs 73.88 seconds.
2024-11-25 15:36:47,235|zzb_logger : INFO 提取指标 8-16 runs 153.27 seconds.
2024-11-25 15:36:47,522|zzb_logger : INFO 表格指标抽取完成任务ID:201917
2024-11-25 15:36:47,522|zzb_logger : INFO 表格指标抽取 201917 完成耗时156.64 秒。
2024-11-25 15:36:47,523|zzb_logger : INFO 启动这个指标归一化任务ID-修改测试:201917
2024-11-25 15:36:47,750|zzb_logger : INFO 目录黑名单为:[146, 170, 183, 190, 339, 403, 411, 417, 427, 449, 450, 549, 551, 553, 556, 575, 594, 604, 606, 607, 608, 611, 612, 613, 614, 615, 617, 618, 619, 620, 621, 626, 633, 641, 737, 738, 742, 747, 1138, 1141, 1144, 1147, 1150, 1154, 1156, 1160, 1163, 1166, 1169, 1173, 1257, 1587, 1588, 1697, 1723, 1727, 1770, 1801, 1814, 1874, 1934, 1943]
2024-11-25 15:36:48,656|zzb_logger : INFO 向量配置数据查询 0.41 秒。
2024-11-25 15:36:48,658|zzb_logger : INFO insert_table_measure_from_vector_async_process方法走的半年报
2024-11-25 15:36:49,797|zzb_logger : INFO Run task 0-351 (6964)...
2024-11-25 15:36:49,797|zzb_logger : INFO 插入数据 2815
2024-11-25 15:36:50,291|zzb_logger : INFO 黑名单的值是[146, 170, 183, 190, 339, 403, 411, 417, 427, 449, 450, 549, 551, 553, 556, 575, 594, 604, 606, 607, 608, 611, 612, 613, 614, 615, 617, 618, 619, 620, 621, 626, 633, 641, 737, 738, 742, 747, 1138, 1141, 1144, 1147, 1150, 1154, 1156, 1160, 1163, 1166, 1169, 1173, 1257, 1587, 1588, 1697, 1723, 1727, 1770, 1801, 1814, 1874, 1934, 1943, 146, 405, 413, 419, 190, 430, 449, 1590, 1725, 594, 1947, 1815, 1817]和['145', '146', '263', '489', '69', '78', '497', '83', '498', '604', '606', '85', '174', '502', '87', '610', '90', '292', '175', '611', '508', '177', '293', '612', '178', '613', '405', '295', '296', '614', '615', '298', '514', '182', '617', '413', '859', '618', '619', '620', '521', '731', '862', '621', '419', '190', '308', '310', '870', '626', '741', '430', '1010', '877', '533', '1013', '880', '534', '1149', '883', '1014', '536', '758', '1152', '537', '886', '539', '1019', '1289', '1158', '892', '542', '544', '641', '213', '449', '1162', '333', '550', '1165', '220', '905', '552', '1168', '777', '222', '224', '339', '1171', '783', '1585', '226', '341', '1311', '1177', '230', '924', '1590', '929', '1319', '566', '567', '802', '1593', '570', '1328', '358', '1721', '805', '478', '1725', '1597', '808', '1465', '481', '672', '952', '815', '482', '1729', '484', '485', '958', '486', '1607', '1474', '1989', '821', '586', '1350', '824', '969', '1738', '828', '1873', '592', '1875', '594', '689', '1093', '982', '690', '838', '600', '2008', '843', '993', '2013', '996', '700', '703', '1248', '1768', '1770', '2035', '1259', '1772', '713', '1773', '2041', '1776', '1140', '1932', '1778', '1143', '1780', '1146', '1938', '2052', '1781', '1782', '1947', '1552', '1427', '1950', '2065', '1556', '1953', '1960', '1561', '1684', '2075', '1966', '1969', '1689', '1690', '1973', '1692', '1808', '1576', '1701', '2091', '1580', '1815', '1702', '1817', '1820', '1709', '1821', '1714', '1825', '1835', '1843']以及新增的[]
2024-11-25 15:36:50,925|zzb_logger : INFO Run task 351-702 (17576)...
2024-11-25 15:36:50,925|zzb_logger : INFO 插入数据 2815
2024-11-25 15:36:51,324|zzb_logger : INFO 黑名单的值是[146, 170, 183, 190, 339, 403, 411, 417, 427, 449, 450, 549, 551, 553, 556, 575, 594, 604, 606, 607, 608, 611, 612, 613, 614, 615, 617, 618, 619, 620, 621, 626, 633, 641, 737, 738, 742, 747, 1138, 1141, 1144, 1147, 1150, 1154, 1156, 1160, 1163, 1166, 1169, 1173, 1257, 1587, 1588, 1697, 1723, 1727, 1770, 1801, 1814, 1874, 1934, 1943, 146, 405, 413, 419, 190, 430, 449, 1590, 1725, 594, 1947, 1815, 1817]和['145', '146', '263', '489', '69', '78', '497', '83', '498', '604', '606', '85', '174', '502', '87', '610', '90', '292', '175', '611', '508', '177', '293', '612', '178', '613', '405', '295', '296', '614', '615', '298', '514', '182', '617', '413', '859', '618', '619', '620', '521', '731', '862', '621', '419', '190', '308', '310', '870', '626', '741', '430', '1010', '877', '533', '1013', '880', '534', '1149', '883', '1014', '536', '758', '1152', '537', '886', '539', '1019', '1289', '1158', '892', '542', '544', '641', '213', '449', '1162', '333', '550', '1165', '220', '905', '552', '1168', '777', '222', '224', '339', '1171', '783', '1585', '226', '341', '1311', '1177', '230', '924', '1590', '929', '1319', '566', '567', '802', '1593', '570', '1328', '358', '1721', '805', '478', '1725', '1597', '808', '1465', '481', '672', '952', '815', '482', '1729', '484', '485', '958', '486', '1607', '1474', '1989', '821', '586', '1350', '824', '969', '1738', '828', '1873', '592', '1875', '594', '689', '1093', '982', '690', '838', '600', '2008', '843', '993', '2013', '996', '700', '703', '1248', '1768', '1770', '2035', '1259', '1772', '713', '1773', '2041', '1776', '1140', '1932', '1778', '1143', '1780', '1146', '1938', '2052', '1781', '1782', '1947', '1552', '1427', '1950', '2065', '1556', '1953', '1960', '1561', '1684', '2075', '1966', '1969', '1689', '1690', '1973', '1692', '1808', '1576', '1701', '2091', '1580', '1815', '1702', '1817', '1820', '1709', '1821', '1714', '1825', '1835', '1843']以及新增的[]
2024-11-25 15:36:52,083|zzb_logger : INFO Run task 702-1053 (1308)...
2024-11-25 15:36:52,083|zzb_logger : INFO 插入数据 2815
2024-11-25 15:36:52,569|zzb_logger : INFO 黑名单的值是[146, 170, 183, 190, 339, 403, 411, 417, 427, 449, 450, 549, 551, 553, 556, 575, 594, 604, 606, 607, 608, 611, 612, 613, 614, 615, 617, 618, 619, 620, 621, 626, 633, 641, 737, 738, 742, 747, 1138, 1141, 1144, 1147, 1150, 1154, 1156, 1160, 1163, 1166, 1169, 1173, 1257, 1587, 1588, 1697, 1723, 1727, 1770, 1801, 1814, 1874, 1934, 1943, 146, 405, 413, 419, 190, 430, 449, 1590, 1725, 594, 1947, 1815, 1817]和['145', '146', '263', '489', '69', '78', '497', '83', '498', '604', '606', '85', '174', '502', '87', '610', '90', '292', '175', '611', '508', '177', '293', '612', '178', '613', '405', '295', '296', '614', '615', '298', '514', '182', '617', '413', '859', '618', '619', '620', '521', '731', '862', '621', '419', '190', '308', '310', '870', '626', '741', '430', '1010', '877', '533', '1013', '880', '534', '1149', '883', '1014', '536', '758', '1152', '537', '886', '539', '1019', '1289', '1158', '892', '542', '544', '641', '213', '449', '1162', '333', '550', '1165', '220', '905', '552', '1168', '777', '222', '224', '339', '1171', '783', '1585', '226', '341', '1311', '1177', '230', '924', '1590', '929', '1319', '566', '567', '802', '1593', '570', '1328', '358', '1721', '805', '478', '1725', '1597', '808', '1465', '481', '672', '952', '815', '482', '1729', '484', '485', '958', '486', '1607', '1474', '1989', '821', '586', '1350', '824', '969', '1738', '828', '1873', '592', '1875', '594', '689', '1093', '982', '690', '838', '600', '2008', '843', '993', '2013', '996', '700', '703', '1248', '1768', '1770', '2035', '1259', '1772', '713', '1773', '2041', '1776', '1140', '1932', '1778', '1143', '1780', '1146', '1938', '2052', '1781', '1782', '1947', '1552', '1427', '1950', '2065', '1556', '1953', '1960', '1561', '1684', '2075', '1966', '1969', '1689', '1690', '1973', '1692', '1808', '1576', '1701', '2091', '1580', '1815', '1702', '1817', '1820', '1709', '1821', '1714', '1825', '1835', '1843']以及新增的[]
2024-11-25 15:36:53,251|zzb_logger : INFO Run task 1053-1404 (24420)...
2024-11-25 15:36:53,251|zzb_logger : INFO 插入数据 2815
2024-11-25 15:36:54,430|zzb_logger : INFO Run task 1404-1755 (27824)...
2024-11-25 15:36:54,430|zzb_logger : INFO 插入数据 2815
2024-11-25 15:36:55,150|zzb_logger : INFO 黑名单的值是[146, 170, 183, 190, 339, 403, 411, 417, 427, 449, 450, 549, 551, 553, 556, 575, 594, 604, 606, 607, 608, 611, 612, 613, 614, 615, 617, 618, 619, 620, 621, 626, 633, 641, 737, 738, 742, 747, 1138, 1141, 1144, 1147, 1150, 1154, 1156, 1160, 1163, 1166, 1169, 1173, 1257, 1587, 1588, 1697, 1723, 1727, 1770, 1801, 1814, 1874, 1934, 1943, 146, 405, 413, 419, 190, 430, 449, 1590, 1725, 594, 1947, 1815, 1817]和['145', '146', '263', '489', '69', '78', '497', '83', '498', '604', '606', '85', '174', '502', '87', '610', '90', '292', '175', '611', '508', '177', '293', '612', '178', '613', '405', '295', '296', '614', '615', '298', '514', '182', '617', '413', '859', '618', '619', '620', '521', '731', '862', '621', '419', '190', '308', '310', '870', '626', '741', '430', '1010', '877', '533', '1013', '880', '534', '1149', '883', '1014', '536', '758', '1152', '537', '886', '539', '1019', '1289', '1158', '892', '542', '544', '641', '213', '449', '1162', '333', '550', '1165', '220', '905', '552', '1168', '777', '222', '224', '339', '1171', '783', '1585', '226', '341', '1311', '1177', '230', '924', '1590', '929', '1319', '566', '567', '802', '1593', '570', '1328', '358', '1721', '805', '478', '1725', '1597', '808', '1465', '481', '672', '952', '815', '482', '1729', '484', '485', '958', '486', '1607', '1474', '1989', '821', '586', '1350', '824', '969', '1738', '828', '1873', '592', '1875', '594', '689', '1093', '982', '690', '838', '600', '2008', '843', '993', '2013', '996', '700', '703', '1248', '1768', '1770', '2035', '1259', '1772', '713', '1773', '2041', '1776', '1140', '1932', '1778', '1143', '1780', '1146', '1938', '2052', '1781', '1782', '1947', '1552', '1427', '1950', '2065', '1556', '1953', '1960', '1561', '1684', '2075', '1966', '1969', '1689', '1690', '1973', '1692', '1808', '1576', '1701', '2091', '1580', '1815', '1702', '1817', '1820', '1709', '1821', '1714', '1825', '1835', '1843']以及新增的[]
2024-11-25 15:36:55,181|zzb_logger : INFO 黑名单的值是[146, 170, 183, 190, 339, 403, 411, 417, 427, 449, 450, 549, 551, 553, 556, 575, 594, 604, 606, 607, 608, 611, 612, 613, 614, 615, 617, 618, 619, 620, 621, 626, 633, 641, 737, 738, 742, 747, 1138, 1141, 1144, 1147, 1150, 1154, 1156, 1160, 1163, 1166, 1169, 1173, 1257, 1587, 1588, 1697, 1723, 1727, 1770, 1801, 1814, 1874, 1934, 1943, 146, 405, 413, 419, 190, 430, 449, 1590, 1725, 594, 1947, 1815, 1817]和['145', '146', '263', '489', '69', '78', '497', '83', '498', '604', '606', '85', '174', '502', '87', '610', '90', '292', '175', '611', '508', '177', '293', '612', '178', '613', '405', '295', '296', '614', '615', '298', '514', '182', '617', '413', '859', '618', '619', '620', '521', '731', '862', '621', '419', '190', '308', '310', '870', '626', '741', '430', '1010', '877', '533', '1013', '880', '534', '1149', '883', '1014', '536', '758', '1152', '537', '886', '539', '1019', '1289', '1158', '892', '542', '544', '641', '213', '449', '1162', '333', '550', '1165', '220', '905', '552', '1168', '777', '222', '224', '339', '1171', '783', '1585', '226', '341', '1311', '1177', '230', '924', '1590', '929', '1319', '566', '567', '802', '1593', '570', '1328', '358', '1721', '805', '478', '1725', '1597', '808', '1465', '481', '672', '952', '815', '482', '1729', '484', '485', '958', '486', '1607', '1474', '1989', '821', '586', '1350', '824', '969', '1738', '828', '1873', '592', '1875', '594', '689', '1093', '982', '690', '838', '600', '2008', '843', '993', '2013', '996', '700', '703', '1248', '1768', '1770', '2035', '1259', '1772', '713', '1773', '2041', '1776', '1140', '1932', '1778', '1143', '1780', '1146', '1938', '2052', '1781', '1782', '1947', '1552', '1427', '1950', '2065', '1556', '1953', '1960', '1561', '1684', '2075', '1966', '1969', '1689', '1690', '1973', '1692', '1808', '1576', '1701', '2091', '1580', '1815', '1702', '1817', '1820', '1709', '1821', '1714', '1825', '1835', '1843']以及新增的[]
2024-11-25 15:36:55,608|zzb_logger : INFO Run task 1755-2106 (22624)...
2024-11-25 15:36:55,608|zzb_logger : INFO 插入数据 2815
2024-11-25 15:36:56,069|zzb_logger : INFO 黑名单的值是[146, 170, 183, 190, 339, 403, 411, 417, 427, 449, 450, 549, 551, 553, 556, 575, 594, 604, 606, 607, 608, 611, 612, 613, 614, 615, 617, 618, 619, 620, 621, 626, 633, 641, 737, 738, 742, 747, 1138, 1141, 1144, 1147, 1150, 1154, 1156, 1160, 1163, 1166, 1169, 1173, 1257, 1587, 1588, 1697, 1723, 1727, 1770, 1801, 1814, 1874, 1934, 1943, 146, 405, 413, 419, 190, 430, 449, 1590, 1725, 594, 1947, 1815, 1817]和['145', '146', '263', '489', '69', '78', '497', '83', '498', '604', '606', '85', '174', '502', '87', '610', '90', '292', '175', '611', '508', '177', '293', '612', '178', '613', '405', '295', '296', '614', '615', '298', '514', '182', '617', '413', '859', '618', '619', '620', '521', '731', '862', '621', '419', '190', '308', '310', '870', '626', '741', '430', '1010', '877', '533', '1013', '880', '534', '1149', '883', '1014', '536', '758', '1152', '537', '886', '539', '1019', '1289', '1158', '892', '542', '544', '641', '213', '449', '1162', '333', '550', '1165', '220', '905', '552', '1168', '777', '222', '224', '339', '1171', '783', '1585', '226', '341', '1311', '1177', '230', '924', '1590', '929', '1319', '566', '567', '802', '1593', '570', '1328', '358', '1721', '805', '478', '1725', '1597', '808', '1465', '481', '672', '952', '815', '482', '1729', '484', '485', '958', '486', '1607', '1474', '1989', '821', '586', '1350', '824', '969', '1738', '828', '1873', '592', '1875', '594', '689', '1093', '982', '690', '838', '600', '2008', '843', '993', '2013', '996', '700', '703', '1248', '1768', '1770', '2035', '1259', '1772', '713', '1773', '2041', '1776', '1140', '1932', '1778', '1143', '1780', '1146', '1938', '2052', '1781', '1782', '1947', '1552', '1427', '1950', '2065', '1556', '1953', '1960', '1561', '1684', '2075', '1966', '1969', '1689', '1690', '1973', '1692', '1808', '1576', '1701', '2091', '1580', '1815', '1702', '1817', '1820', '1709', '1821', '1714', '1825', '1835', '1843']以及新增的[]
2024-11-25 15:36:56,789|zzb_logger : INFO Run task 2106-2457 (23664)...
2024-11-25 15:36:56,789|zzb_logger : INFO 插入数据 2815
2024-11-25 15:36:57,633|zzb_logger : INFO 黑名单的值是[146, 170, 183, 190, 339, 403, 411, 417, 427, 449, 450, 549, 551, 553, 556, 575, 594, 604, 606, 607, 608, 611, 612, 613, 614, 615, 617, 618, 619, 620, 621, 626, 633, 641, 737, 738, 742, 747, 1138, 1141, 1144, 1147, 1150, 1154, 1156, 1160, 1163, 1166, 1169, 1173, 1257, 1587, 1588, 1697, 1723, 1727, 1770, 1801, 1814, 1874, 1934, 1943, 146, 405, 413, 419, 190, 430, 449, 1590, 1725, 594, 1947, 1815, 1817]和['145', '146', '263', '489', '69', '78', '497', '83', '498', '604', '606', '85', '174', '502', '87', '610', '90', '292', '175', '611', '508', '177', '293', '612', '178', '613', '405', '295', '296', '614', '615', '298', '514', '182', '617', '413', '859', '618', '619', '620', '521', '731', '862', '621', '419', '190', '308', '310', '870', '626', '741', '430', '1010', '877', '533', '1013', '880', '534', '1149', '883', '1014', '536', '758', '1152', '537', '886', '539', '1019', '1289', '1158', '892', '542', '544', '641', '213', '449', '1162', '333', '550', '1165', '220', '905', '552', '1168', '777', '222', '224', '339', '1171', '783', '1585', '226', '341', '1311', '1177', '230', '924', '1590', '929', '1319', '566', '567', '802', '1593', '570', '1328', '358', '1721', '805', '478', '1725', '1597', '808', '1465', '481', '672', '952', '815', '482', '1729', '484', '485', '958', '486', '1607', '1474', '1989', '821', '586', '1350', '824', '969', '1738', '828', '1873', '592', '1875', '594', '689', '1093', '982', '690', '838', '600', '2008', '843', '993', '2013', '996', '700', '703', '1248', '1768', '1770', '2035', '1259', '1772', '713', '1773', '2041', '1776', '1140', '1932', '1778', '1143', '1780', '1146', '1938', '2052', '1781', '1782', '1947', '1552', '1427', '1950', '2065', '1556', '1953', '1960', '1561', '1684', '2075', '1966', '1969', '1689', '1690', '1973', '1692', '1808', '1576', '1701', '2091', '1580', '1815', '1702', '1817', '1820', '1709', '1821', '1714', '1825', '1835', '1843']以及新增的[]
2024-11-25 15:36:58,127|zzb_logger : INFO 等待所有子任务完成任务ID:201917
2024-11-25 15:36:58,127|zzb_logger : INFO Run task 2457-2815 (10160)...
2024-11-25 15:36:58,127|zzb_logger : INFO 插入数据 2815
2024-11-25 15:36:58,816|zzb_logger : INFO 黑名单的值是[146, 170, 183, 190, 339, 403, 411, 417, 427, 449, 450, 549, 551, 553, 556, 575, 594, 604, 606, 607, 608, 611, 612, 613, 614, 615, 617, 618, 619, 620, 621, 626, 633, 641, 737, 738, 742, 747, 1138, 1141, 1144, 1147, 1150, 1154, 1156, 1160, 1163, 1166, 1169, 1173, 1257, 1587, 1588, 1697, 1723, 1727, 1770, 1801, 1814, 1874, 1934, 1943, 146, 405, 413, 419, 190, 430, 449, 1590, 1725, 594, 1947, 1815, 1817]和['145', '146', '263', '489', '69', '78', '497', '83', '498', '604', '606', '85', '174', '502', '87', '610', '90', '292', '175', '611', '508', '177', '293', '612', '178', '613', '405', '295', '296', '614', '615', '298', '514', '182', '617', '413', '859', '618', '619', '620', '521', '731', '862', '621', '419', '190', '308', '310', '870', '626', '741', '430', '1010', '877', '533', '1013', '880', '534', '1149', '883', '1014', '536', '758', '1152', '537', '886', '539', '1019', '1289', '1158', '892', '542', '544', '641', '213', '449', '1162', '333', '550', '1165', '220', '905', '552', '1168', '777', '222', '224', '339', '1171', '783', '1585', '226', '341', '1311', '1177', '230', '924', '1590', '929', '1319', '566', '567', '802', '1593', '570', '1328', '358', '1721', '805', '478', '1725', '1597', '808', '1465', '481', '672', '952', '815', '482', '1729', '484', '485', '958', '486', '1607', '1474', '1989', '821', '586', '1350', '824', '969', '1738', '828', '1873', '592', '1875', '594', '689', '1093', '982', '690', '838', '600', '2008', '843', '993', '2013', '996', '700', '703', '1248', '1768', '1770', '2035', '1259', '1772', '713', '1773', '2041', '1776', '1140', '1932', '1778', '1143', '1780', '1146', '1938', '2052', '1781', '1782', '1947', '1552', '1427', '1950', '2065', '1556', '1953', '1960', '1561', '1684', '2075', '1966', '1969', '1689', '1690', '1973', '1692', '1808', '1576', '1701', '2091', '1580', '1815', '1702', '1817', '1820', '1709', '1821', '1714', '1825', '1835', '1843']以及新增的[]
2024-11-25 15:39:18,387|zzb_logger : INFO 所有子任务完成任务ID:201917
2024-11-25 15:39:18,387|zzb_logger : INFO 启动指标归一化任务ID:201917
2024-11-25 15:39:18,387|zzb_logger : INFO 向量更新时间 149.73 秒。
2024-11-25 15:39:18,548|zzb_logger : INFO 更新数据查询 0.14 秒。
2024-11-25 15:39:18,548|zzb_logger : INFO update_ori_measure方法走的是半年报
2024-11-25 15:39:18,548|zzb_logger : INFO 更新数据更新 0.00 秒。
2024-11-25 15:39:18,625|zzb_logger : INFO 更新数据写入 0.08 秒。
2024-11-25 15:39:18,625|zzb_logger : INFO 归一化完成任务ID:201917
2024-11-25 15:39:18,625|zzb_logger : INFO 任务 201917 完成耗时356.04 秒。
2024-11-25 15:39:18,811|zzb_logger : INFO 通知任务状态url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=1
2024-11-25 15:39:18,811|zzb_logger : INFO 通知任务状态任务:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<title>Error response</title>
</head>
<body>
<h1>Error response</h1>
<p>Error code: 404</p>
<p>Message: File not found.</p>
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
</body>
</html>
2024-11-25 15:39:18,968|zzb_logger : INFO 任务 201917 完成

1070
zzb_data_word/main.py Normal file

File diff suppressed because it is too large

151
zzb_data_word/main_1.py Normal file
View File

@@ -0,0 +1,151 @@
import camelot
import re
from multiprocessing import Pool
import os, time, random
import json
from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
from datetime import datetime
# Read the PDF
import PyPDF2
# Analyze the PDF layout and extract text
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
import pdfplumber
import mysql.connector
import db_service
from multiprocessing import Process
from config import REDIS_HOST, REDIS_PORT, REDIS_PASSWORD
import utils


def text_in_table(top, tables_range, page_num):
    if tables_range.get(page_num):
        for table_range in tables_range[page_num]:
            if top < table_range['top'] and top > table_range['buttom']:
                return True
    return False
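
# A minimal sketch of the structure this helper assumes (hypothetical values):
# tables_range maps a page number to the table bounding boxes collected in
# get_table_range, e.g. {3: [{'top': 700.5, 'buttom': 520.0, 'table_index': 1, 'page_num': 3}]},
# so text_in_table(600.0, tables_range, 3) is True and text_in_table(300.0, tables_range, 3) is False.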
def get_text_type(text: str):
    text = re.sub(r"\s", "", text)
    first_re = '年度报告'
    page_number_pattern = re.compile(r'^\d+(/\d+)?$')
    if re.search(first_re, text.strip()):
        return 'page_header'
    if page_number_pattern.match(text.strip()):
        return 'page_footer'
    if len(text) < 20 and text.endswith(''):
        return 'page_footer'
    return 'text'
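
# Hedged examples (hypothetical inputs): a line containing '年度报告' -> 'page_header',
# a bare page number such as '12/208' -> 'page_footer', and an ordinary body sentence
# of 20+ characters -> 'text'.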
# Read the text content of a PDF file, excluding tables
def get_text_content(pdf_path, file_id, tables_range, conn, cursor):
    """
    :return: the text content of the PDF file, excluding tables
    """
    # Extract the pages from the PDF, e.g. page_numbers=[4,5,6]
    for pagenum, page in enumerate(extract_pages(pdf_path)):
        try:
            # Find all elements on the page
            page_elements = [(element.y1, element) for element in page._objs]
            # Walk the elements that make up the page
            for i, component in enumerate(page_elements):
                try:
                    # Extract the element from the page layout
                    element = component[1]
                    # Check whether the element is a text element
                    if isinstance(element, LTTextBoxHorizontal):
                        # element_top = element.bbox[3]
                        print(element)
                        line_text = element.get_text().replace('\n', '')
                        line_text = re.sub(r"\s", "", line_text)
                        if delete_flag(line_text):
                            continue
                        # if not text_in_table(element_top, tables_range, pagenum+1):
                        db_service.insert_pdf_text_info({
                            'file_id': file_id,
                            'page_num': pagenum + 1,
                            'text': line_text
                        }, conn, cursor)
                except Exception as e:
                    print(f'Error while processing element {i} on page {pagenum}')
                    print(e)
        except Exception as e:
            print(f'Error while processing page {pagenum}')
            print(e)


def delete_flag(text: str):
    if utils.under_non_alpha_ratio(text):
        return True
    if not re.findall(',||。|、||', text):
        return True
    if text.find('适用') != -1 and text.find('不适用') != -1:
        return True
    if text.find('') != -1 and text.find('') != -1:
        return True
    return False
def get_table_range(file_path, file_id, pages, tables_range):
    print('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
    start = time.time()
    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB
    )
    # Create a cursor to execute SQL statements
    cursor = conn.cursor(buffered=True)
    tables = camelot.read_pdf(file_path, pages=pages, strip_text=',\n', copy_text=['v', 'h'], shift_text=['l'])
    for t in tables:
        top = t._bbox[3]
        buttom = t._bbox[1]
        page_num = int(t.page)
        table_index = int(t.order)
        if not tables_range.get(page_num):
            tables_range[page_num] = []
        tables_range[page_num].append({
            'top': top,
            'buttom': buttom,
            'table_index': table_index,
            'page_num': page_num,
        })
    get_text_content(file_path, file_id, tables_range, conn, cursor)
    cursor.close()
    conn.close()
    end = time.time()
    print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))


if __name__ == "__main__":
    path = "/Users/zhengfei/Desktop/cb/002315-2023-nb-nb.pdf"
    # get_text_content(path,'111')
    # get_table_measure(path,'all','111')
    # print(pdf_data)
    # pdf_info = []
    tables_range = {}
    get_table_range(path, '5555', 'all', tables_range)
    # sorted_pdf_info = sorted(pdf_info, key=lambda k: k['sort_num'])
    # pdf_tables = merge_consecutive_arrays(sorted_pdf_info)
    # for table in pdf_tables:
    #     print(table)  # test edit

785
zzb_data_word/main_word.py Normal file
View File

@@ -0,0 +1,785 @@
import re
import os, time
from config import MILVUS_CLIENT, MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB, MEASURE_COUNT, MYSQL_HOST_APP, MYSQL_USER_APP, MYSQL_PASSWORD_APP, MYSQL_DB_APP
import mysql.connector
import utils
from pymilvus import MilvusClient
import numpy as np
from multiprocessing import Process
from config import REDIS_HOST, REDIS_PORT, REDIS_PASSWORD
import redis
import db_service_word
from zzb_logger import applog

'''
Known issues:
1. Table/text extraction error: when a table and text share a page with the text first, the text cannot be extracted.
2. LLM extraction error: when extracting 2023 operating revenue, the change ratio of main-business revenue by product is wrongly extracted.
3. Indicators that belong to tables are extracted as if they were text.
4. When the LLM extracts indicators, semantically unrelated ones are grouped together; consider vector similarity to tell them apart.
'''
# Processing flow:
# 1. get_table_range: collect all tables and their context in multiple processes, producing one complete list
# 2. merge tables that span pages in a single process, producing a new array of table objects
# 3. run the original indicator-parsing flow over the new table array in multiple processes
STR_PATTERN = '营业收入|净利润|变动比例|损益|现金流量净额|现金净流量|现金流|每股收益|总资产|资产总额|收益率|货币资金|应收账款|存货|固定资产|在建工程|商誉|短期借款|应付账款|合同负债|长期借款|营业成本|销售费用|管理费用|财务费用|研发费用|研发投入'
PATTERN = '品牌类型|分门店|销售渠道|行业名称|产品名称|地区名称|子公司名称|业绩快报|调整情况说明|调整年初资产负债表|计入当期损益的政府补助|主要子公司|分部|母公司资产负债表|显示服务|渠道|商品类型|合同分类|会计政策变更|地区分类|研发项目|分类产品|表头不合规的表格|内部控制评价|关联方|国内地区|国外地区|销售区域|存货库龄|外币|逾期60天以上|欧元|英镑|美元|日元'
MUILT_PATTERN = '调整前'
# unit_pattern = re.compile(r'单位[|:]?(百万元|千万元|亿元|万元|千元|元)')
unit_pattern = re.compile(r'(单位|单元|人民币).{0,6}?(百万元|千万元|亿元|万元|千元|元).{0,3}?')  # relaxed unit matching: no colon required, only the distance is limited
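
# A quick check of the relaxed pattern (hypothetical string): for
# unit_pattern.search('单位:人民币万元'), group(1) is '单位' and group(2) is '万元',
# the unit that get_table_unit_info later stores for the following table.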
# Build the left-header text for a value cell
def get_col_num_info(array, row_num, col_num, x, y):
    num_info = ""
    for j in range(col_num):
        if len(str(array[x][j])) > 50:
            continue
        num_info += str(array[x][j])
    return num_info.replace('%', '')

# Build the top-header text for a value cell
def get_row_num_info(array, row_num, col_num, x, y):
    num_info = ""
    for i in range(row_num):
        if len(str(array[i][y])) > 50:
            continue
        num_info += str(array[i][y])
    return num_info
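
# Sketch of how the two helpers name a value cell (hypothetical 2x2 table):
# for arr = [['项目', '2023年度'], ['营业收入', '1000']] and the value at (1, 1),
# get_col_num_info collects the left header '营业收入' and get_row_num_info the top
# header '2023年度'; get_table_measure then joins them into '2023年度营业收入'
# (and the reversed '营业收入2023年度').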
def table_converter(table):
    table_string = ''
    # Iterate over every row of the table
    for row_num in range(len(table)):
        row = table[row_num]
        # Flatten line breaks inside wrapped cell text
        cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
        # Join the row into the string, separated by commas
        table_string += (','.join(cleaned_row))
    # Drop the trailing character
    table_string = table_string[:-1]
    return table_string

# Check whether a string contains Chinese characters
def is_chinese(s):
    return bool(re.search('[\u4e00-\u9fff]', s))

def check_table(arr):
    split_index = None
    for i in range(arr.shape[0]):
        # Skip the header rows
        if arr[i, 0] == "" and is_chinese(arr[i, 1]) and i > 1:
            split_index = i
            break
    if split_index is not None:
        arr1 = arr[:split_index]
        arr2 = arr[split_index:]
        return [arr1, arr2]
    else:
        return [arr]
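
# Hypothetical split: in [['项目', '金额'], ['收入', '10'], ['', '补充说明'], ['成本', '5']]
# row 2 has an empty first cell with Chinese text beside it (and i > 1), so check_table
# returns two arrays, cut just before that row.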
def safe_process_array(func, arr):
    try:
        return func(arr)
    except Exception as e:
        print(f"{func.__name__} raised an error: {e}")
        return arr  # return the original array so later steps can continue

# Targeted fix for Q3-report balance sheets whose rows were merged into single cells
def process_array(arr, years=['2022', '2023', '2024'], keyword='项目'):
    # Make sure the row has enough columns to hold the split values
    def ensure_columns(row, num_columns):
        while len(row) < num_columns:
            row.append('')

    def is_valid_header(header, years, keyword):
        header_text = header.lower()  # lowercase for more robust matching
        return any(year in header_text for year in years) and keyword in header_text

    # Clean up the header string
    def clean_text(text):
        # Remove spaces around “年” and “月”
        text = re.sub(r'\s*(年|月)\s*', r'\1', text)
        # Remove “日” together with any spaces to its left
        text = re.sub(r'\s*日', '', text)
        return text

    # Convert a numpy array to a plain list
    arr = arr.tolist() if isinstance(arr, np.ndarray) else arr
    if len(arr[0]) == 1 and is_valid_header(arr[0][0], years, keyword):
        remaining_value = arr[0][0]
        # Clean the header string
        remaining_value = clean_text(remaining_value)
        parts = remaining_value.split()
        ensure_columns(arr[0], len(parts))
        for i in range(len(parts)):
            arr[0][i] = parts[i]
        header_columns = len(arr[0])
        for i in range(1, len(arr)):
            if len(arr[i]) == 1:
                remaining_value = arr[i][0]
                parts = remaining_value.split()
                if len(parts) > header_columns:
                    parts = parts[:header_columns]
                ensure_columns(arr[i], header_columns)
                for j in range(len(parts)):
                    arr[i][j] = parts[j]
                # Pad with empty strings when the split yields fewer values than the header
                if len(parts) < header_columns:
                    for j in range(len(parts), header_columns):
                        arr[i][j] = ''
    return arr
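
# A hedged before/after example (hypothetical Q3 balance-sheet fragment):
# [['项目 2024年 9月30日 2023年12月31日'], ['货币资金 100 90']] becomes
# [['项目', '2024年9月30', '2023年12月31'], ['货币资金', '100', '90']]:
# clean_text strips the spaces around 年/月 and drops 日, then every single-cell
# row is split on whitespace and padded to the header width.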
# Q3-report specific fix: distinguish the two “上年同期” columns under 本报告期 and 年初至报告期末
def process_array_with_annual_comparison(arr, keywords=['本报告期', '年初至报告期末', '上年同期']):
    def contains_all_keywords(header, keywords):
        return all(keyword in header for keyword in keywords)

    def split_and_replace_occurrences(header, target, replacement):
        # Find every position where target occurs
        indices = [i for i, x in enumerate(header) if x == target]
        if len(indices) > 1:
            split_index = len(indices) // 2
            for i in range(split_index):
                header[indices[i]] = replacement
        return header

    # Convert a numpy array to a plain list
    arr = arr.tolist() if isinstance(arr, np.ndarray) else arr
    if len(arr) > 0 and len(arr[0]) > 0:
        first_row = arr[0]
        if contains_all_keywords(first_row, keywords):
            # Split the “上年同期” occurrences and relabel the first half
            first_row = split_and_replace_occurrences(first_row, '上年同期', '三季报中无需识别的上年同期')
            arr[0] = first_row
    return arr
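
# Hypothetical header ['项目', '本报告期', '上年同期', '年初至报告期末', '上年同期']:
# both '上年同期' columns are found and the first half of them is relabelled, giving
# ['项目', '本报告期', '三季报中无需识别的上年同期', '年初至报告期末', '上年同期'],
# so only the year-to-date comparison keeps its original label.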
# Separate handling of non-recurring gains/losses in Q3 reports
def process_array_with_grants(arr, keywords=['本报告期', '年初至报告期'], target='计入当期损益的政府补助',
                              replacement='非经常性损益'):
    # Check that the first row contains all keywords
    def contains_all_keywords(header, keywords):
        # return all(keyword in header for keyword in keywords)
        return all(any(keyword in str(cell) for cell in header) for keyword in keywords)

    # Check whether the target text occurs in the first column
    def contains_target_in_first_column(arr, target):
        return any(target in str(item[0]) for item in arr)

    # Replace a specific value in the first column
    def replace_in_first_column(arr, target, replacement):
        for i in range(len(arr)):
            if arr[i][0] == target:
                arr[i][0] = replacement
        return arr

    # Convert a numpy array to a plain list
    arr = arr.tolist() if isinstance(arr, np.ndarray) else arr
    if len(arr) > 0 and len(arr[0]) > 0:
        first_row = arr[0]
        # Check the conditions on the first row and the first column
        if contains_all_keywords(first_row, keywords) and contains_target_in_first_column(arr, target):
            # Replace “合计” in the first column
            arr = replace_in_first_column(arr, '合计', replacement)
    return arr
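
# Hedged example: in a non-recurring-items table whose first column mentions
# '计入当期损益的政府补助', the '合计' row is relabelled to '非经常性损益', so the total
# is later extracted under a meaningful indicator name instead of a bare '合计'.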
# Process table data
def process_table(file_id, tables):
    applog.info('Run task %s (%s)...' % (f'处理word文件中的table file_id:{file_id}', os.getpid()))
    start = time.time()
    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB
    )
    # Create a cursor to execute SQL statements
    cursor = conn.cursor(buffered=True)
    for t in tables:
        try:
            arr = np.array(t["data"])
            arr = safe_process_array(process_array, arr)  # merged balance-sheet rows
            arr = safe_process_array(process_array_with_annual_comparison, arr)  # complex tables with several “上年同期” columns
            arr = safe_process_array(process_array_with_grants, arr)  # Q3 non-recurring gains/losses
            arr = np.char.replace(arr, ' ', '')
            arr = np.char.replace(arr, '\n', '')
            arr = np.char.replace(arr, ',', '')
            arr_list = check_table(arr)
            for a in arr_list:
                new_data = a.tolist()  # kept for saving to the database below
                new_data = utils.check_black_table_list(new_data)
                rows, cols = a.shape
                if rows == 1 and cols == 1:
                    continue
                arr_str = ''.join([''.join(map(str, row)) for row in a])
                # Store the full data into word_parse_data first
                db_service_word.insert_word_parse_process({
                    'file_id': file_id,
                    'page_num': t["index"],
                    'page_count': 100,
                    'type': 'table',
                    'content': {
                        'page_num': t["index"],
                        'table_index': t["index"],
                        "type": "table",
                        "data": new_data,
                    }}, conn, cursor, "word_parse_data")
                # Filter out tables that contain none of the indicators to extract
                matches = re.findall(STR_PATTERN, arr_str)
                pattern = re.findall(PATTERN, arr_str)
                muilt_pattern = re.findall(MUILT_PATTERN, arr_str)
                if len(matches) > 0 and len(muilt_pattern) < 5:
                    # if len(matches) > 0 and len(pattern) == 0 and len(muilt_pattern) < 5:
                    db_service_word.insert_word_parse_process({
                        'file_id': file_id,
                        'page_num': t["index"],
                        'page_count': 100,
                        'type': 'parse_table',
                        'content': {
                            'page_num': t["index"],
                            'table_index': t["index"],
                            "type": "table",
                            "data": new_data,
                        }}, conn, cursor, "word_parse_process")
        except Exception as e:
            applog.info(f'解析表格时出现了异常 {e} 内容为{t}')
    cursor.close()
    conn.close()
    end = time.time()
    applog.info('Task %s runs %0.2f seconds.' % (f'解析表格{file_id}', (end - start)))
def text_in_table(top, tables_range, page_num):
    if tables_range.get(page_num):
        for table_range in tables_range[page_num]:
            if top < table_range['top'] and top > table_range['buttom']:
                return True
    return False

def get_text_type(text: str):
    text = re.sub(r"\s", "", text)
    first_re = '年度报告'
    page_number_pattern = re.compile(r'^\d+(/\d+)?$')
    if re.search(first_re, text.strip()):
        return 'page_header'
    if page_number_pattern.match(text.strip()):
        return 'page_footer'
    if len(text) < 20 and text.endswith(''):
        return 'page_footer'
    return 'text'
def check_report_type(file_id):
    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB
    )
    # Create a cursor to execute SQL statements
    cursor = conn.cursor(buffered=True)
    # Look up the report type and year recorded for this file
    select_year_select = f"""select report_type,year from report_check where id = {file_id}"""
    cursor.execute(select_year_select)
    record_select = cursor.fetchall()
    if record_select:
        report_type = record_select[0][0]
        report_year = record_select[0][1]
        cursor.close()
        conn.close()
        return int(report_type), report_year
    else:
        cursor.close()
        conn.close()
        return None
# From a text index, find the index of the nearest following table, validating the amount of text in between
def get_next_table_index(text_index, texts, tables):
    try:
        for table in tables:
            if table["index"] > text_index and table["type"] == "table":
                table_index = table["index"]
                total_len = sum(len(texts.get(key).get("data").replace(" ", "")) for key in range(text_index + 1, table_index))
                # The nearest table's index must be within 10 elements
                if (table_index - text_index) < 10 and total_len < 50:
                    # and the intervening strings must add up to fewer than 50 characters
                    return table_index
                else:
                    return text_index
    except StopIteration:
        applog.error("Target not found")
    return text_index
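
# Hedged example (hypothetical indexes): a title at index 5 followed by one short unit
# line and a table at index 7 returns 7 (gap under 10 elements, intervening text under
# 50 characters); a distant or text-heavy table falls back to returning 5 itself.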
# Process text data
def process_text_content(file_id, texts, tables, full_texts, type=0):
    applog.info('Run task %s (%s)...' % (f'处理word文件中的 text file_id:{file_id}', os.getpid()))
    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB
    )
    # Create a cursor to execute SQL statements
    cursor = conn.cursor(buffered=True)
    report_type, report_year = check_report_type(file_id)
    texts_dict = {t["index"]: t for t in full_texts}
    query = "SELECT title_list,button_list FROM table_title_list WHERE report_year = %s"
    cursor_dict = conn.cursor(dictionary=True)
    cursor_dict.execute(query, (report_year,))
    result = cursor_dict.fetchone()
    title_list = result['title_list']
    button_list = result['button_list']
    try:
        for t in texts:
            line_text = t["data"]
            line_text = re.sub(r"\s", "", line_text)
            line_text = re.sub(r":", ":", line_text)  # normalize full-width colons to ASCII
            index = t["index"]
            if len(re.findall('母公司|现金流量表补充', line_text)) > 0:
                db_service_word.insert_measure_parser_info({
                    'file_id': file_id,
                    'content': get_next_table_index(index, texts_dict, tables),
                    'type': 'parent_com',
                }, conn, cursor)
            # Keep the text just above each table: it holds the table title and the unit of its indicators
            table_info = {}
            if (utils.check_table_title_black_list(line_text, title_list)
                    or utils.check_table_title_black_list_button(line_text, button_list)):
                db_service_word.insert_measure_parser_info({
                    'file_id': file_id,
                    'content': get_next_table_index(index, texts_dict, tables),
                    'type': 'table_index',
                }, conn, cursor)
            if utils.check_table_title_black_list_measure(line_text):
                db_service_word.insert_measure_parser_info_measure({
                    'file_id': file_id,
                    'content': get_next_table_index(index, texts_dict, tables),
                    'type': 'measure_index',
                }, conn, cursor, line_text)
            if re.findall(unit_pattern, line_text):
                # This line carries the unit of the following table
                table_info = get_table_unit_info(file_id, line_text, t["index"], t["index"] + 1)
                db_service_word.insert_table_unit_info_v1(table_info, conn, cursor)
            if utils.check_table_title_black_list_measure(line_text):
                db_service_word.insert_measure_parser_info_measure({
                    'file_id': file_id,
                    'content': f"{t['index']}_1",
                    'type': 'measure_index',
                }, conn, cursor, line_text)
            if not utils.pdf_text_flag(line_text):
                if utils.check_line_text(line_text):
                    db_service_word.insert_word_parse_process({
                        'file_id': file_id,
                        'page_num': t["index"],
                        'page_count': 100,
                        'type': 'parse_table',
                        'content': {
                            'page_num': t["index"],
                            'table_index': t["index"],
                            "type": "text",
                            'content': line_text,
                        }}, conn, cursor, "word_parse_process")
                # Stored for the forbidden-word check
                db_service_word.insert_word_parse_process({
                    'file_id': file_id,
                    'page_num': t["index"],
                    'page_count': 100,
                    'type': 'text',
                    'content': {
                        'page_num': t["index"],
                        'table_index': t["index"],
                        "type": "text",
                        'content': line_text,
                    }}, conn, cursor, "word_parse_data")
                table_name = "word_text_info"
                if type == 1:
                    table_name = "id_text_info"
                # Write to the database using the chosen table name
                db_service_word.batch_insert_page_text({
                    'file_id': file_id,
                    'page_num': t["index"],
                    'text': line_text
                }, conn, cursor, table_name)
    except Exception as e:
        applog.error(f'文本处理异常{e}')
def get_table_unit_info(file_id, line_text, page_num, table_index):
    table_info = {}
    table_info['file_id'] = file_id
    match = unit_pattern.search(line_text)
    if match:
        unit = match.group(2)
        table_info['unit'] = unit
    table_info['page_num'] = page_num
    table_info['table_index'] = table_index
    return table_info
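
# Hypothetical call: get_table_unit_info(201917, '单位:万元', 12, 13) returns
# {'file_id': 201917, 'unit': '万元', 'page_num': 12, 'table_index': 13}.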
def get_table_text_info(file_id, line_text, page_num, table_index):
    table_info = {}
    table_info['file_id'] = file_id
    table_info['text_info'] = line_text
    table_info['page_num'] = page_num
    table_info['table_index'] = table_index
    return table_info
# Read the tables and join each value with its headers, e.g. “2022年1季度营业收入为xxxxx”
def get_table_measure(file_id, word_tables, record_range):
    """
    :return: table values joined with their headers, e.g. “2022年1季度营业收入为xxxxx”
    """
    try:
        redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
        conn = mysql.connector.connect(
            host=MYSQL_HOST,
            user=MYSQL_USER,
            password=MYSQL_PASSWORD,
            database=MYSQL_DB
        )
        # Create a cursor to execute SQL statements
        cursor = conn.cursor(buffered=True)
        conn_app = mysql.connector.connect(
            host=MYSQL_HOST_APP,
            user=MYSQL_USER_APP,
            password=MYSQL_PASSWORD_APP,
            database=MYSQL_DB_APP
        )
        # Create a cursor to execute SQL statements
        cursor_app = conn_app.cursor(buffered=True)
        select_year_select = f"""select report_type,year from report_check where id = {file_id}"""
        cursor.execute(select_year_select)
        record_select = cursor.fetchall()
        report_type = record_select[0][0]
        report_year = record_select[0][1]
        client = MilvusClient(
            uri=MILVUS_CLIENT
        )
        applog.info('提取指标任务 %s (%s)...' % (record_range, os.getpid()))
        start = time.time()
        record_start = record_range.split('-')[0]
        record_end = record_range.split('-')[1]
        for index in range(int(record_start), int(record_end)):
            t = word_tables[index]
            measure_obj = []
            data_dict = {}
            measure_list = []
            try:
                arr = np.array(t['data'])
                rows, cols = arr.shape
                if rows == 1 and cols == 1:
                    continue
                row_num, col_num = -1, -1
                # Nested loop over the array to locate the first numeric cell
                for i in range(rows):
                    for j in range(cols):
                        if j == 0 or i == 0:  # guard against numbers in the first row/column
                            continue
                        measure_value_config = str(arr[i, j]).replace('(', '').replace(')', '')
                        if re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', measure_value_config):
                            if j == cols - 1:
                                row_num, col_num = i, j
                                break
                        elif (re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', measure_value_config)
                                or measure_value_config == '-'):
                            row_num, col_num = i, j
                            break
                    else:
                        continue
                    break
                # Walk the numeric sub-matrix and turn every value into a semantically named indicator
                if row_num != -1 and col_num != -1:
                    for i in range(row_num, arr.shape[0]):
                        for j in range(col_num, arr.shape[1]):
                            measure_value = str(arr[i, j]).replace('%', '').replace('(', '-').replace(')', '')
                            if measure_value == '-' or measure_value == '' or len(measure_value) > 20:
                                continue
                            else:
                                row_num_info = get_row_num_info(arr, row_num, col_num, i, j)
                                col_num_info = get_col_num_info(arr, row_num, col_num, i, j)
                                # An empty top header means the table was truncated; filter these out (研发投入 is special-cased elsewhere)
                                if row_num_info in ('', '-', ')', ''):
                                    continue
                                # When 非经常性损益合计 and 非经常性损益净额 both appear, keep only the net amount
                                if col_num_info == '非经常性损益合计':
                                    continue
                                if utils.check_pdf_measure_black_list(f"{col_num_info}{row_num_info}"):
                                    continue
                                # Drop indicators that carry no period
                                if utils.check_pdf_measure(f"{col_num_info}{row_num_info}"):
                                    continue
                                # Skip when the top and left headers disagree on the period
                                row_period = utils.get_period_type_other(row_num_info, report_year)
                                col_period = utils.get_period_type_other(col_num_info, report_year)
                                if (row_period != col_period and row_period != 'c_n' and col_period != 'c_n'):
                                    continue
                                units_mapping = {
                                    "百万元": "百万元",
                                    "千万元": "千万元",
                                    "亿元": "亿元",
                                    "万元": "万元",
                                    "千元": "千元",
                                    "": "",
                                    "元/股": ""
                                }
                                row_num_info = row_num_info.replace('%', '增减')
                                # num_info = f"{col_num_info}{row_num_info}".replace('','').replace('加:','').replace('减:','').replace('%','')
                                num_info = utils.get_clean_text(f"{row_num_info}{col_num_info}")
                                num_info_bak = utils.get_clean_text(f"{col_num_info}{row_num_info}")
                                measure_unit = ''
                                # "%": "同期增减"
                                combined_info = f"{row_num_info} {col_num_info}"
                                # for unit in units_mapping:
                                #     if unit in row_num_info:
                                #         measure_unit = units_mapping[unit]
                                #         break
                                if utils.get_percent_flag(row_num_info) == '1':
                                    measure_unit = ''
                                else:
                                    for unit in units_mapping:
                                        if re.search(rf'\\s*{unit}(\s*人民币)?\s*\|\(\s*{unit}(\s*人民币)?\s*\)', combined_info) or (re.search(rf'{unit}', combined_info) and any(re.search('单位', item) for item in arr[0])):
                                            measure_unit = units_mapping[unit]
                                            break
                                measure_list.append({
                                    'measure_name': num_info,
                                    'measure_value': measure_value,
                                    'measure_unit': measure_unit,
                                })
                                measure_list.append({
                                    'measure_name': num_info_bak,
                                    'measure_value': measure_value,
                                    'measure_unit': measure_unit,
                                })
                if not redis_client.exists(f'parsed_measure_count_{file_id}'):
                    redis_client.set(f'parsed_measure_count_{file_id}', 0)
                redis_client.incr(f'parsed_measure_count_{file_id}')
                if len(measure_list) > 0:
                    data_dict["measure_list"] = measure_list
                    data_dict["page_num"] = f"{str(t['page_num'])}_{str(t['table_index'])}"
                    data_dict['file_id'] = file_id
                    measure_obj.append(data_dict)
                    db_service_word.insert_measure_data_to_milvus(client, measure_obj, cursor_app, conn_app)
            except Exception as e:
                applog.error(f"循环获取表格数据这里报错了,数据是{t['data']},位置在{index}")
                applog.error(f"错误是:{e}")
        end = time.time()
        applog.info('提取指标 %s runs %0.2f seconds.' % (record_range, (end - start)))
    except Exception as e:
        applog.error(f'这个错误是{e},所在的位置是{record_start}-{record_end}')
        record_start = record_range.split('-')[0]
        record_end = record_range.split('-')[1]
        for index in range(int(record_start), int(record_end)):
            t = word_tables[index]
            try:
                arr = np.array(t['data'])
            except Exception as e:
                applog.error(f'这个错误是{e}的arr的值是{arr}')
    finally:
        redis_client.close()
        client.close()
        cursor.close()
        conn.close()
        cursor_app.close()
        conn_app.close()
# Indicator normalization
def update_measure_data(file_id, file_path, parent_table_pages):
    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB
    )
    # Create a cursor to execute SQL statements
    cursor = conn.cursor(buffered=True)
    # Query indicators through the vector store
    conn_app = mysql.connector.connect(
        host=MYSQL_HOST_APP,
        user=MYSQL_USER_APP,
        password=MYSQL_PASSWORD_APP,
        database=MYSQL_DB_APP
    )
    # Create a cursor to execute SQL statements
    cursor_app = conn_app.cursor(buffered=True)
    applog.info(f'目录黑名单为:{parent_table_pages}')
    db_service_word.delete_to_run(conn, cursor, file_id)
    db_service_word.insert_table_measure_from_vector_async_process(cursor, parent_table_pages, file_id, file_path)
    # Indicator normalization
    db_service_word.update_ori_measure(conn, cursor, file_id)
    # db_service.delete_database(conn_app, cursor_app, file_id)
    cursor.close()
    conn.close()
    cursor_app.close()
    conn_app.close()
def merge_consecutive_arrays(word_info):
    merged_objects = []
    for info_obj in word_info:
        try:
            if info_obj['type'] == 'table':
                # Collect every table object
                merged_objects.append(info_obj)
        except Exception as e:
            applog.error(f"解析数据错误: {e}")
    return merged_objects

def merge_consecutive_arrays_v1(pdf_info):
    merged_objects = []
    temp_array = {}

    def is_same_dimension(data1, data2):
        # The two tables must have the same row count and matching row widths
        if len(data1) != len(data2):
            return False
        return all(len(row1) == len(row2) for row1, row2 in zip(data1, data2))

    for info_obj in pdf_info:
        try:
            if info_obj['type'] == 'table':
                if not temp_array:
                    # Initialize the buffer when it is empty
                    temp_array = info_obj
                else:
                    # Check whether the current table matches the buffered table's dimensions
                    if is_same_dimension(temp_array['data'], info_obj['data']):
                        # Same dimensions: merge the data
                        temp_array['data'].extend(info_obj['data'])
                    else:
                        # Different dimensions: flush the buffer and start over with this table
                        merged_objects.append(temp_array)
                        temp_array = info_obj
            else:
                # Not a table: flush the buffer if it is non-empty
                if temp_array:
                    # Append the buffered table to the result list
                    merged_objects.append(temp_array)
                    temp_array = {}  # reset the buffer
        except Exception as e:
            applog.error(f"解析数据错误: {e}")
    # After the loop, flush any remaining buffered table
    if temp_array:
        merged_objects.append(temp_array)
    return merged_objects
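
# Hedged sketch: two consecutive 'table' objects whose data has the same row count and
# per-row widths are concatenated into one object, rebuilding a table split by a page
# break; any 'text' object in between flushes the buffer and keeps the parts separate.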
def start_table_measure_job(file_id):
    conn_app = mysql.connector.connect(
        host=MYSQL_HOST_APP,
        user=MYSQL_USER_APP,
        password=MYSQL_PASSWORD_APP,
        database=MYSQL_DB_APP
    )
    # Create a cursor to execute SQL statements
    cursor_app = conn_app.cursor(buffered=True)
    select_process_query = '''
        select DISTINCT content from word_parse_process WHERE file_id = '{file_id}' and type='parse_table' order by page_num
    '''.format(file_id=file_id)
    cursor_app.execute(select_process_query)
    records = cursor_app.fetchall()
    word_info = []
    for record in records:
        word_info.append(eval(record[0]))
    # Collect the table data
    word_tables = merge_consecutive_arrays(word_info)
    redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
    redis_client.set(f'measure_count_{file_id}', len(word_tables))
    cursor_app.close()
    conn_app.close()
    redis_client.close()
    records_range_parts = utils.get_range(len(word_tables), MEASURE_COUNT)
    processes = []
    for record_range in records_range_parts:
        # get_table_measure(file_id, word_tables, record_range)
        p = Process(target=get_table_measure, args=(file_id, word_tables, record_range,))
        processes.append(p)
        p.start()
    for p in processes:
        p.join()

File diff suppressed because it is too large

4
zzb_data_word/nohup.out Normal file
View File

@@ -0,0 +1,4 @@
/Users/zhengfei/opt/anaconda3/envs/py310/lib/python3.10/multiprocessing/resource_tracker.py:224: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown
warnings.warn('resource_tracker: There appear to be %d '
/Users/zhengfei/opt/anaconda3/envs/py310/lib/python3.10/multiprocessing/resource_tracker.py:224: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown
warnings.warn('resource_tracker: There appear to be %d '

4164
zzb_data_word/not_match.txt Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,57 @@
from docx import Document
from pymilvus import MilvusClient
import requests
import json, time, os

directory_path = '/Users/zhengfei/Desktop/大模型/书籍/第二批'
client = MilvusClient(
    uri='http://114.55.128.195:19530'
)
# Walk the directory
for filename in os.listdir(directory_path):
    # Build the full file path
    try:
        file_path = os.path.join(directory_path, filename)
        # Load the Word document
        print(file_path)
        doc = Document(file_path)
        text = ''
        # Read all paragraphs in the document
        i = 0
        data = []
        # for para in doc.paragraphs:
        for num in range(200, len(doc.paragraphs) - 200):
            # Append the paragraph text to the current chunk
            try:
                text += doc.paragraphs[num].text
                # Once the chunk exceeds 500 characters, embed it, queue it, and reset the chunk
                if len(text) > 500:
                    i += 1
                    response = requests.post("http://114.55.128.195:8001/get_embedding/", json={"text": [text]}, headers={"Content-Type": "application/json"})
                    res_json = json.loads(response.text)
                    if res_json["code"] == 200:
                        vector = res_json["data"][0]
                        measure_data = {}
                        measure_data['vector'] = vector
                        measure_data['text'] = text
                        measure_data['source'] = '/projects/ai_chat/knowledge_base/ydkf/content/骨盆和骶骼关节功能解剖 手法操作指南 详解局部解剖和功能 涵盖评估分析 运动 肌肉能量技术及替代_14533413.docx'
                        data.append(measure_data)
                    text = ''
                # Flush the queued chunks to Milvus every 20 chunks and at the final paragraph
                if (i > 20 or num == len(doc.paragraphs) - 200 - 1):
                    res = client.insert(
                        collection_name="ydkf",
                        data=data
                    )
                    i = 0
                    data = []
            except Exception as e:
                print(e)
    except Exception as e:
        print(e)

269
zzb_data_word/parse_word.py Normal file
View File

@ -0,0 +1,269 @@
from docx import Document
import json
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from lxml import etree
import os
import zipfile
RESULT_TYPE_TEXT = 'text'
RESULT_TYPE_TABLE = 'table'
def build_result(result_type, index, data):
return {
'type': result_type,
'index': index,
'data': data
}
def build_catalog_result(index, depth, data):
return {
'index': index,
'depth': depth,
'data': data
}
# 解析docx文件中的XML内容
def get_xml_content(docx_filename, xml_filename):
with zipfile.ZipFile(docx_filename) as z:
return z.read(xml_filename)
def parse_paragraph(paragraph, index, namespaces):
paragraph_text = paragraph.text.strip() if paragraph else ''
if paragraph_text:
return build_result(RESULT_TYPE_TEXT, index, paragraph_text)
return None
def parse_table(table, index):
table_data = []
for row in table.rows:
row_data = [cell.text for cell in row.cells]
table_data.append(row_data)
return build_result(RESULT_TYPE_TABLE, index, table_data)
def parse_paragraph_element(paragraph_element, index, namespaces):
paragraph_xml = etree.fromstring(paragraph_element.xml)
paragraph_text = ''.join(paragraph_xml.xpath('//w:t/text()', namespaces=namespaces)).strip()
if paragraph_text:
return build_result(RESULT_TYPE_TEXT, index, paragraph_text)
return None
def parse_table_element(table_element, index, namespaces):
table_xml = etree.fromstring(table_element.xml)
table_data = []
for row in table_xml.xpath('//w:tr', namespaces=namespaces):
row_data = []
for cell in row.xpath('./w:tc | ./w:sdt', namespaces=namespaces):
cell_text = ''.join(cell.xpath('.//w:t/text()', namespaces=namespaces)).strip()
grid_span_xpath = etree.XPath('.//w:tcPr/w:gridSpan/@w:val', namespaces=namespaces)
grid_span = int(grid_span_xpath(cell)[0]) if grid_span_xpath(cell) else 1
if grid_span > 1:
row_data.extend([cell_text] * grid_span)
else:
row_data.append(cell_text)
table_data.append(row_data)
return build_result(RESULT_TYPE_TABLE, index, table_data)
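# A minimal sketch of the gridSpan handling above, run against a hand-written fragment of
# WordprocessingML (hypothetical XML, not taken from a real document): a cell spanning two
# grid columns is duplicated so the row keeps a consistent cell count.
def _demo_grid_span():
W = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
ns = {'w': W}
row = etree.fromstring(
f'<w:tr xmlns:w="{W}">'
'<w:tc><w:tcPr><w:gridSpan w:val="2"/></w:tcPr><w:p><w:r><w:t>项目</w:t></w:r></w:p></w:tc>'
'<w:tc><w:p><w:r><w:t>2023年</w:t></w:r></w:p></w:tc>'
'</w:tr>')
cells = []
for tc in row.xpath('./w:tc', namespaces=ns):
text = ''.join(tc.xpath('.//w:t/text()', namespaces=ns))
span = tc.xpath('.//w:tcPr/w:gridSpan/@w:val', namespaces=ns)
cells.extend([text] * (int(span[0]) if span else 1))
print(cells)  # expected: ['项目', '项目', '2023年']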
def add_to_catalog(element_xml, index, catalog_content, namespaces, paragraph_text, heading_styles):
p_element = etree.fromstring(element_xml)
# outlineLvl = p_element.xpath('.//w:outlineLvl', namespaces=namespaces)
# if outlineLvl:
# level = int(outlineLvl[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'))
# catalog_content.append(build_catalog_result(index, level, paragraph_text))
level = is_heading_paragraph(p_element, heading_styles, namespaces)
if level != -1:
catalog_content.append(build_catalog_result(index, level, paragraph_text))
# 检查段落是否为标题样式
def is_heading_paragraph(paragraph, heading_styles, namespaces):
pPr = paragraph.find('.//w:pPr', namespaces=namespaces)
if pPr is not None:
pStyle = pPr.find('.//w:pStyle', namespaces=namespaces)
pOutLineLvl = pPr.find('.//w:outlineLvl', namespaces=namespaces)
if pStyle is not None:
style_val = pStyle.get(f"{{{namespaces['w']}}}val")
if style_val.isdigit():
return int(style_val)
if pOutLineLvl is not None:
outLineLvl_val = pOutLineLvl.get(f"{{{namespaces['w']}}}val")
if outLineLvl_val.isdigit():
return int(outLineLvl_val) + 1
# if pStyle is not None and pStyle.get(ns['w'] + 'val') in heading_styles:
# if style_val > 0:
# return True
return -1
def get_paragraph_text(paragraph_element, namespaces):
paragraph_text = ''
for run in paragraph_element.findall('.//w:r', namespaces=namespaces):
for text in run.findall('.//w:t', namespaces=namespaces):
paragraph_text += text.text if text.text is not None else ''
return paragraph_text
def add_to_catalog_paragraph(text, index, catalog_content, namespaces):
# 添加段落到目录
catalog_content.append(build_catalog_result(index, 1, text)) # 假设默认级别为1
def parse_sdt_catalog(sdt_element, catalog_content, index, namespaces):
sdt_content = sdt_element.find('.//w:sdtContent', namespaces=namespaces)
if sdt_content is not None:
for child in sdt_content:
if child.tag.endswith('p'): # 内容控件中的段落
paragraph_text = get_paragraph_text(child, namespaces)
if paragraph_text.strip(): # 检查文本是否为空
add_to_catalog_paragraph(paragraph_text, index, catalog_content, namespaces)
index += 1 # 更新索引
elif child.tag.endswith('tbl'): # 内容控件中的表格
# 处理表格内容(如果需要)
pass
elif child.tag.endswith('sdt'): # 嵌套的内容控件
index = parse_sdt_catalog(child, catalog_content, index, namespaces) # 递归解析嵌套的内容控件
return index
def parse_docx(docx_path):
try:
document = Document(docx_path)
styles_xml = get_xml_content(docx_path, 'word/styles.xml')
except Exception as e:
print(f"Error loading document: {e}")
return None, None
doc_content = [] # 内容(文本+表格)
catalog_content = [] # 目录
current_index = 1 # 维护全局的 index 变量
paragraph_index = 0
table_index = 0
# 获取整个文档的XML内容
xml_root = document.part.element
namespaces = xml_root.nsmap
# 获取所有标题样式
styles_root = etree.fromstring(styles_xml)
heading_styles = set()
for style in styles_root.xpath('//w:style', namespaces=namespaces):
style_type = style.get(namespaces['w'] + 'type')
if style_type == 'paragraph' and style.get(namespaces['w'] + 'styleId').startswith('Heading'):
heading_styles.add(style.get(namespaces['w'] + 'styleId'))
# 遍历文档中的所有元素
for i, element in enumerate(document.element.body):
if isinstance(element, CT_P): # 段落
paragraph_result = parse_paragraph_element(element, current_index, namespaces)
if paragraph_result:
doc_content.append(paragraph_result)
# 判断是否为目录,是就插入目录内容
paragraph = document.paragraphs[paragraph_index]
add_to_catalog(paragraph._element.xml, current_index, catalog_content, namespaces, paragraph.text, heading_styles)
current_index += 1 # 更新 index
paragraph_index += 1
elif isinstance(element, CT_Tbl): # 表格
table_result = parse_table_element(element, current_index, namespaces)
if table_result:
doc_content.append(table_result)
current_index += 1 # 更新 index
table_index += 1
elif element.tag.endswith('sdt'): # 内容控件
current_index = parse_sdt(element, doc_content, current_index, namespaces, catalog_content, heading_styles) # 更新索引
return json.dumps(doc_content, indent=4, ensure_ascii=False), json.dumps(catalog_content, indent=4, ensure_ascii=False)
def parse_sdt(sdt_element, doc_content, current_index, namespaces, catalog_content, heading_styles):
sdtContent = sdt_element.find('.//w:sdtContent', namespaces=namespaces)
if sdtContent is not None:
for child in sdtContent:
if child.tag.endswith('p'): # 内容控件中的段落
paragraph_text = ''
for run in child.findall('.//w:r', namespaces=namespaces):
for text in run.findall('.//w:t', namespaces=namespaces):
paragraph_text += text.text if text.text is not None else ''
if paragraph_text.strip(): # 检查文本是否为空
doc_content.append(build_result(RESULT_TYPE_TEXT, current_index, paragraph_text.strip()))
# 判断是否为目录,是就插入目录内容
add_to_catalog(child.xml, current_index, catalog_content, namespaces, paragraph_text, heading_styles)
current_index += 1 # 更新索引
elif child.tag.endswith('tbl'): # 内容控件中的表格
table_data = []
merged_cells = {} # 用于记录跨行单元格的信息
for row_idx, row in enumerate(child.findall('.//w:tr', namespaces=namespaces)):
row_data = []
for col_idx, cell in enumerate(row.findall('.//w:tc', namespaces=namespaces)):
cell_text = ''
for run in cell.findall('.//w:r', namespaces=namespaces):
for text in run.findall('.//w:t', namespaces=namespaces):
cell_text += text.text if text.text is not None else ''
# 检查单元格是否跨列
grid_span_xpath = etree.XPath('.//w:tcPr/w:gridSpan/@w:val', namespaces=namespaces)
grid_span = int(grid_span_xpath(cell)[0]) if grid_span_xpath(cell) else 1
if grid_span > 1:
row_data.extend([cell_text.strip()] * grid_span)
else:
row_data.append(cell_text.strip())
# 检查单元格是否跨行
v_merge_xpath = etree.XPath('.//w:tcPr/w:vMerge/@w:val', namespaces=namespaces)
v_merge = v_merge_xpath(cell)
if v_merge and v_merge[0] == 'restart':
merged_cells[(row_idx, col_idx)] = (int(grid_span), 1)
elif v_merge and v_merge[0] == 'continue':
if (row_idx - 1, col_idx) in merged_cells:
merged_cells[(row_idx - 1, col_idx)] = (merged_cells[(row_idx - 1, col_idx)][0], merged_cells[(row_idx - 1, col_idx)][1] + 1)
# 跨行单元格不需要再次添加到 row_data 中
else:
# 只有非跨行单元格才需要添加到 row_data 中
pass
# 处理跨行单元格
for (r, c), (col_span, row_span) in list(merged_cells.items()):
if r < row_idx:
for i in range(row_span):
if r + i == row_idx:
row_data[c:c] = [row_data[c]] * (col_span - 1)
break
if r + row_span - 1 == row_idx:
del merged_cells[(r, c)]
table_data.append(row_data)
if table_data: # 检查表格数据是否为空
doc_content.append(build_result(RESULT_TYPE_TABLE, current_index, table_data))
current_index += 1 # 更新索引
elif child.tag.endswith('sdt'): # 嵌套的内容控件
current_index = parse_sdt(child, doc_content, current_index, namespaces, catalog_content, heading_styles) # 递归解析嵌套的内容控件
return current_index # 返回更新后的索引
def split_text_table(json_data):
# 分组
text_elements = [element for element in json_data if element['type'] == 'text']
table_elements = [element for element in json_data if element['type'] == 'table']
# 转换为JSON字符串
text_elements_json = json.dumps(text_elements, ensure_ascii=False, indent=4)
table_elements_json = json.dumps(table_elements, ensure_ascii=False, indent=4)
return text_elements_json, table_elements_json
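# A tiny sketch of split_text_table on hypothetical parsed output: text and table elements
# are separated into two JSON strings.
def _demo_split_text_table():
demo = [
{'type': 'text', 'index': 1, 'data': '第一节 重要提示'},
{'type': 'table', 'index': 2, 'data': [['项目', '2023年']]},
]
texts_json, tables_json = split_text_table(demo)
print(texts_json)   # JSON array holding only the text element
print(tables_json)  # JSON array holding only the table element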
def append_to_file(file_path, text):
try:
with open(file_path, 'a', encoding='utf-8') as file:
file.write(text + '\n')
except Exception as e:
print(f"Error writing to file: {e}")
if __name__ == "__main__":
current_directory = os.getcwd()
docx_relative_path = '101.docx'
file_relative_path = 'file\\docx\\test1.txt'
docx_path = os.path.join(current_directory, docx_relative_path)
file_path = os.path.join(current_directory, file_relative_path)
try:
parsed_content, catalog_content = parse_docx(docx_path)
if parsed_content and catalog_content:
json_parsed_content = json.loads(parsed_content)
text_elements_json, table_elements_json = split_text_table(json_parsed_content)
append_to_file(file_path, text_elements_json)
append_to_file(file_path, table_elements_json)
append_to_file(file_path, catalog_content)
except Exception as e:
print(f"Error parse_docx: {e}")

View File

@ -0,0 +1,934 @@
import camelot
import re
from multiprocessing import Pool
import os, time, random
import json
from config_p import MILVUS_CLIENT,MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB,MEASURE_COUNT,MYSQL_HOST_APP,MYSQL_USER_APP,MYSQL_PASSWORD_APP,MYSQL_DB_APP
from datetime import datetime
# 读取PDF
import PyPDF2
# 分析PDF的layout提取文本
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
import pdfplumber
import mysql.connector
import utils
from pymilvus import MilvusClient
import llm_service
import db_service
import pdf_title
import numpy as np
from multiprocessing import Process
from config_p import REDIS_HOST,REDIS_PORT,REDIS_PASSWORD
import redis
'''
已知发现问题
1.表格和文本提取错误表格和文本内容在同一页文本在前表格在后的文本数据提取不出来
2.大模型抽取错抽取2023年营业收入主营业务收入分产品的营业收入变动比例被错误抽取
3.表格中的指标被抽取成文本中
4.大模型抽取指标时语义完全不同的指标被放一起考虑用向量相似度来判断
'''
# 数据处理流程
# 1. get_table_range多进程获取所有表格及表格上下文输出为一个完整的列表
# 2. 单进程进行表格分页合并,输出一个新的表格对象数组
# 3. 新表格对象数组多进程开始原来的解析指标流程
STR_PATTERN = '营业收入|净利润|变动比例|损益|现金流量净额|现金净流量|现金流|每股收益|总资产|资产总额|收益率|货币资金|应收账款|存货|固定资产|在建工程|商誉|短期借款|应付账款|合同负债|长期借款|营业成本|销售费用|管理费用|财务费用|研发费用|研发投入'
PATTERN = '品牌类型|分门店|销售渠道|行业名称|产品名称|地区名称|子公司名称|业绩快报|调整情况说明|调整年初资产负债表|计入当期损益的政府补助|主要子公司|分部|母公司资产负债表|显示服务|渠道|商品类型|合同分类|会计政策变更|地区分类|研发项目|分类产品|表头不合规的表格|内部控制评价|关联方|国内地区|国外地区|销售区域|存货库龄|外币|逾期60天以上|欧元|英镑|美元|日元'
MUILT_PATTERN = '调整前'
#unit_pattern = re.compile(r'单位[|:]?(百万元|千万元|亿元|万元|千元|元)')
unit_pattern = re.compile(r'(单位|单元|人民币).{0,6}?(百万元|千万元|亿元|万元|千元|元).{0,3}?')#修改单位匹配规则,不限制冒号,只限制距离
#获取指标的表头信息
def get_col_num_info(array,row_num,col_num,x,y):
num_info=""
for j in range(col_num):
if len(str(array[x][j])) > 50:
continue
num_info += str(array[x][j])
return num_info.replace('%','')
#获取指标的表头信息
def get_row_num_info(array,row_num,col_num,x,y):
num_info=""
for i in range(row_num):
if len(str(array[i][y])) > 50:
continue
num_info += str(array[i][y])
return num_info
def table_converter(table):
table_string = ''
# iterate over every row of the table
for row_num in range(len(table)):
row = table[row_num]
# flatten wrapped cell text and replace empty cells with the string 'None'
cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
# join the cells of the row with commas and append to the output string
table_string+=(','.join(cleaned_row))
# drop the trailing character
table_string = table_string[:-1]
return table_string
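# A quick sketch of table_converter on hypothetical rows: embedded newlines become spaces,
# None cells become the string 'None', and each row's cells are joined with commas.
def _demo_table_converter():
demo_rows = [['项目', '2023年'], ['营业收入\n万元', None]]
print(table_converter(demo_rows))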
def get_table_range(file_path, file_id, pages, tables_range):
print('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
start = time.time()
conn = mysql.connector.connect(
host= MYSQL_HOST,
user= MYSQL_USER,
password= MYSQL_PASSWORD,
database= MYSQL_DB
)
# 创建一个cursor对象来执行SQL语句
cursor = conn.cursor(buffered=True)
conn_app = mysql.connector.connect(
host= MYSQL_HOST_APP,
user= MYSQL_USER_APP,
password= MYSQL_PASSWORD_APP,
database= MYSQL_DB_APP
)
cursor_app = conn_app.cursor(buffered=True)
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
try:
tables = camelot.read_pdf(file_path, pages=pages, strip_text=',\n', copy_text=['v','h'],shift_text = ['l'])
for t in tables:
top = t._bbox[3]
buttom = t._bbox[1]
page_num = int(t.page)
table_index = int(t.order)
arr = np.array(t.data)
if len(arr[0]) == 6 and arr[0][0]== "项目" and arr[0][1] == '' and '2022' in arr[0][2] and '2021' in arr[0][2]:
remaining_value = arr[0][2]#initial_value.replace("项目", "", 1)
split_index = len(remaining_value) // 2
arr[0][1] = remaining_value[:split_index]
arr[0][2] = remaining_value[split_index:]
if len(arr[0]) == 4 and all(value == arr[0][0] for value in arr[0]) and all("项目" in arr[0][0] and "附注" in arr[0][0] for value in arr[0]):
initial_value = arr[0][0]
project_value = "项目"
note_value = "附注"
remaining_value = initial_value.replace("项目", "", 1).replace("附注", "", 1)
split_index = len(remaining_value) // 2
first_half = remaining_value[:split_index]
second_half = remaining_value[split_index:]
# 判断 "项目" 在 original_value 中的位置
if "项目" in initial_value and first_half in initial_value:
project_index = initial_value.index("项目")
year_index = initial_value.index(first_half)
# 判断 "项目" 是否在 first_half 的前面
if project_index > year_index:
first_half, second_half = second_half, first_half
arr[0] = [project_value, note_value, first_half, second_half]
if len(arr[0]) == 3 and all(value == arr[0][0] for value in arr[0]) and all("项目" in arr[0][0] for value in arr[0]):
initial_value = arr[0][0]
project_value = "项目"
#note_value = "附注"
remaining_value = initial_value.replace("项目", "", 1)
split_index = len(remaining_value) // 2
first_half = remaining_value[:split_index]
second_half = remaining_value[split_index:]
arr[0] = [project_value, first_half, second_half]
#for i in range(len(arr[0])):
#if arr[0][i] == arr[1][i] and len(arr[0][i])<5:
#print(f'{arr[0][i]}')
#arr[1][i] = ''
#保留camelot中的空格在这里依据空格进行手动表格拆分
#for line in arr:
for line in arr:
if not line[0].replace('.', '', 1).isdigit() and any(line[i] == line[i+1] and ' ' in line[i] for i in range(1, len(line) - 1)):
for i in range(1, len(line) - 1):
if line[i] == line[i+1] and ' ' in line[i]:
split_value = line[i]
split_parts = split_value.split(' ', 1) # 使用 split 方法进行分割
if len(split_parts) == 2: # 确保确实进行了分割
first_half, second_half = split_parts
line[i] = first_half
line[i+1] = second_half
break
#处理完之后保证arr中不再存在空格
#arr = [[item.rieplace(' ', '') for item in line] for line in arr]
arr = np.char.replace(arr, ' ', '')
#这里是防止出现表格左右拼接的情况
first_row = arr[0]
if len(first_row) % 2 == 0 and all(cell.strip() for cell in first_row):
mid_point = len(first_row) // 2
if np.array_equal(first_row[:mid_point], first_row[mid_point:]):
new_arr = []
for i in range(mid_point):
new_row = np.concatenate([arr[:, i], arr[:, i + mid_point]])
new_arr.append(new_row)
arr = np.array(new_arr).T
#这里开始对无效的表头进行处理
try:
invalid_headers = ["上年年末余额"]
non_empty_values = [value for value in first_row if value]#要求就是首行除了空值外的值都必须是一致的
if len(set(non_empty_values)) == 1 and non_empty_values[0] in invalid_headers:
arr[0] = ["表头不合规的表格"] * len(first_row)
except Exception as e:
print(f'在识别表头是否合规时出现了报错:{e}')
#这里是防止出现'2023年度2022年度'camelot识别错误
if not arr[0][0].replace('.', '', 1).isdigit() and any(arr[0][i] == arr[0][i+1] and '2023' in arr[0][i] and '2022' in arr[0][i] for i in range(1, len(arr[0])-1)):
for i in range(1, len(arr[0])-1):
if arr[0][i] == arr[0][i+1] and '2023' in arr[0][i] and '2022' in arr[0][i]:
split_value = arr[0][i]
split_index = len(split_value) // 2
first_half = split_value[:split_index]
second_half = split_value[split_index:]
arr[0][i] = first_half
arr[0][i+1] = second_half
break
#防止2023与2022同时出现
if not arr[0][0].replace('.', '', 1).isdigit():
# 遍历第一行的值
for i in range(1, len(arr[0]) - 1):
# 检查相邻的两个值是否同时包含 '2023' 和 '2022'(且 '2023' 在 '2022' 之前)
if (('2023' in arr[0][i] and '2022' in arr[0][i+1]) and
(arr[0][i].index('2023') < arr[0][i+1].index('2022'))):
# 更新这两个值
arr[0][i] = '2023年'
arr[0][i+1] = '2022年'
break
#这里开始对可能解析错误的值做判断:
for i, row in enumerate(arr):
if len(row) >= 4:
# 检查条件:第一列不为数字,第二列和第四列为空,第三列有三个小数点【三列的数字被识别到一起了】
if (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 4 and len(row[2].rsplit('.', 1)[-1]) == 2) and (row[3] == ''):
split_values = row[2].split('.')
# 确保可以正确拆分成三个数值
if len(split_values) == 4:
new_value1 = f"{split_values[0]}.{split_values[1][:2]}"
new_value2 = f"{split_values[1][2:]}.{split_values[2][:2]}"
new_value3 = f"{split_values[2][2:]}.{split_values[3]}"
row[1] = new_value1
row[2] = new_value2
row[3] = new_value3
#检查条件:第一列不为数字,第二列第四列为空,第三列两个小数点,第五列两个小数点【两列的数字被识别到一起了】
if len(row) >= 5 and (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 3) and (row[3] == '') and (len(row[4].split('.')) == 3) and len(row[2].rsplit('.', 1)[-1]) == 2 and len(row[4].rsplit('.', 1)[-1]) == 2:
split_value_3 = row[2].split('.')
split_value_5 = row[4].split('.')
if len(split_value_3) == 3:
new_value2 = f"{split_value_3[0]}.{split_value_3[1][:2]}"
new_value3 = f"{split_value_3[1][2:]}.{split_value_3[2]}"
if len(split_value_5) == 3:
new_value4 = f"{split_value_5[0]}.{split_value_5[1][:2]}"
new_value5 = f"{split_value_5[1][2:]}.{split_value_5[2]}"
row[1] = new_value2
row[2] = new_value3
row[3] = new_value4
row[4] = new_value5
#检查条件:第一列不为数字,第二列为空,第三列有两个小数点,第四列为正常数字【两列的数字被识别到一起了】
if len(row) >= 4 and (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 3) and len(row[2].rsplit('.', 1)[-1]) == 2 and (row[3].replace('-', '', 1).replace('.', '', 1).isdigit()):
split_values = row[2].split('.')
if len(split_values) == 3:
new_value2 = f"{split_values[0]}.{split_values[1][:2]}"
new_value3 = f"{split_values[1][2:]}.{split_values[2]}"
row[1] = new_value2
row[2] = new_value3
#检查条件:第一列不位数字,后面有一列中的值存在“%”并且"%"不是结尾,就进行拆分
if not row[0].replace('.', '', 1).isdigit():
for i in range(1, len(row) - 1):
if row[i] == '' and '%' in row[i + 1] and len(row[i + 1].split('%')) == 2:
split_values = row[i + 1].split('%')
new_value1 = f"{split_values[0]}%"
new_value2 = f"{split_values[1]}"
row[i] = new_value1
row[i + 1] = new_value2
break
new_data = arr.tolist()#用于后面保存到数据库中
new_data = utils.check_black_table_list(new_data)
rows, cols = arr.shape
if rows == 1 and cols == 1:
continue
arr_str = ''.join([''.join(map(str, row)) for row in arr])
#过滤掉不包含需抽取指标表格的文本
matches = re.findall(STR_PATTERN, arr_str)
pattern = re.findall(PATTERN,arr_str)
muilt_pattern = re.findall(MUILT_PATTERN,arr_str)
if len(matches) > 0 and len(pattern) == 0 and len(muilt_pattern)<5:
if not tables_range.get(page_num):
tables_range[page_num] = []
tables_range[page_num].append({
'top' : top,
'buttom' : buttom,
'table_index' : table_index,
'page_num' : page_num,
})
db_service.insert_pdf_parse_process({
'file_id': file_id,
'page_num' : page_num,
'page_count' : 100,
'type' : 'parse_table',
'content':{
'top' : top,
'buttom' : buttom,
'page_num' : page_num,
'table_index' : table_index,
"type" : "table",
"data" : new_data,
'sort_num' : page_num*1000 - top
}},conn_app,cursor_app)
except Exception as e:
print(f'camelot解析表格时出现了{e}')
get_text_content(file_path, file_id, tables_range, pages, conn, cursor, redis_client, conn_app, cursor_app)
cursor.close()
conn.close()
cursor_app.close()
conn_app.close()
redis_client.close()
end = time.time()
print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
def text_in_table(top, tables_range, page_num):
if tables_range.get(page_num):
for rng in tables_range[page_num]:
if top < rng['top'] and top > rng['buttom']:
return True
return False
def get_text_type(text: str):
text = re.sub(r"\s", "", text)
first_re = '年度报告'
page_number_pattern = re.compile(r'^\d+(/\d+)?$')
if re.search(first_re, text.strip()):
return 'page_header'
if page_number_pattern.match(text.strip()):
return 'page_footer'
if len(text) < 20 and text.endswith(''):
return 'page_footer'
return 'text'
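# A small sketch of how get_text_type classifies lines (hypothetical inputs): the header
# rule keys off '年度报告', bare page numbers such as '12/300' count as footers, and longer
# body lines fall through to 'text'.
def _demo_get_text_type():
for line in ('2023年年度报告', '12/300', '报告期内公司实现营业收入较上年同期增长百分之十'):
print(line, '->', get_text_type(line))
# expected: page_header, page_footer, text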
# read the text content of the pdf, tables excluded
def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_client, conn_app, cursor_app):
"""
:return: the text content of the pdf file, tables excluded
"""
#print(f'tables_range 的值为{tables_range}')
#print('----------------')
#print(pages)
page_start = pages.split('-')[0]
page_end = pages.split('-')[1]
print(f'pages的值为{pages}')
select_year_select = f"""select report_type,year from report_check where id = {file_id}"""
cursor.execute(select_year_select)
record_select = cursor.fetchall()
report_type = record_select[0][0]
report_year = record_select[0][1]
select_pdf_text_check = f"""select count(1) from pdf_text_info where file_id = {file_id}"""
#check_if_empty_query = f"SELECT COUNT(*) FROM pdf_text_info where file_id = {file_id} and page_num = {page_num}"
cursor.execute(select_pdf_text_check)
is_empty = cursor.fetchone()[0] == 0
query = "SELECT title_list,button_list FROM table_title_list WHERE report_year = %s"
cursor_dict = conn.cursor(dictionary=True)
cursor_dict.execute(query, (report_year,))
result = cursor_dict.fetchone()
title_list = result['title_list']
button_list = result['button_list']
# 我们从PDF中提取页面,page_numbers=[4,5,6]
for pagenum, page in enumerate(extract_pages(pdf_path)):
try:
if pagenum+1 < int(page_start) or pagenum+1 > int(page_end):
continue
#更新redis已解析页码
if not redis_client.exists(f'parsed_page_count_{file_id}'):
redis_client.set(f'parsed_page_count_{file_id}', 0)
redis_client.incr(f'parsed_page_count_{file_id}')
# 找到所有的元素
page_elements = [(element.y1, element) for element in page._objs]
# 查找组成页面的元素
line_texts = []
#if not utils.pdf_text_flag(line_text):
# line_texts.append(line_text)
for i,component in enumerate(page_elements):
# 提取页面布局的元素
element = component[1]
# 检查该元素是否为文本元素
if isinstance(element, LTTextBoxHorizontal):
# 检查文本是否出现在表中
line_text = element.get_text().replace('\n','')
line_text = re.sub(r"\s", "", line_text)
#提取符合要求的文本写入pdf_text_info用于文本书写错误识别
if not utils.pdf_text_flag(line_text):
line_texts.append(line_text)
#db_service.insert_pdf_text_info({
# 'file_id': file_id,
# 'page_num' : pagenum+1,
# 'text' : line_text
# },conn,cursor)
element_top = element.bbox[3]
element_buttom = element.bbox[1]
out_table_list = ['母公司现金流量表','母公司利润表','母公司资产负债表','子公司']
# 检查该文本是否出现在表中
if tables_range.get(pagenum+1):
for range in tables_range[pagenum+1]:
if element_top < range['top'] and element_top > range['buttom']:#总是有母公司表被识别到上一个表里面:
pass
else:
if element_top - range['top'] < 150 and element_top - range['top'] > 5 and (not text_in_table(element_top, tables_range, pagenum+1) or any(word in line_text for word in out_table_list)):#or any(word in line_text for word in out_table_list)
text_type = get_text_type(line_text)
if text_type in ('page_header','page_footer'):
break
if pagenum ==44:
print(f'line_text在第44页的值有{line_text}')
#这个对一整页都有用,会去掉很多正确的表
# 记录需要过滤掉的页码
if len(re.findall('母公司|现金流量表补充', line_text)) > 0 :
db_service.insert_measure_parser_info({
'file_id': file_id,
'content': pagenum+1,
'type': 'parent_com',
},conn_app,cursor_app)
# 保存每个表格上方小范围区域的文字,这部分内容包含了表格的标题和指标单位
table_info = {}
if utils.check_table_title_black_list(line_text,title_list):
db_service.insert_measure_parser_info({
'file_id': file_id,
'content': f"{range['page_num']}_{range['table_index']}",
'type': 'table_index',
},conn_app,cursor_app)
if utils.check_table_title_black_list_measure(line_text):
db_service.insert_measure_parser_info_measure({
'file_id': file_id,
'content': f"{range['page_num']}_{range['table_index']}",
'type': 'measure_index',
},conn_app,cursor_app,line_text)
if re.findall(unit_pattern, line_text):
range['unit_flag'] = True
table_info = get_table_unit_info(file_id,line_text,range['page_num'],range['table_index'])
db_service.insert_table_unit_info_v1(table_info,conn,cursor)
# if utils.check_table_title_black_list(line_text):
# db_service.insert_measure_parser_info({
# 'file_id': file_id,
# 'content': f"{range['page_num']}_{range['table_index']}",
# 'type': 'table_index',
# },conn,cursor)
else:
if len(line_text) <= 5 or len(re.findall('单位|适用', line_text)) > 0 :
pass
#else:
# table_info = get_table_text_info(file_id,line_text,range['page_num'],range['table_index'])
# db_service.insert_table_text_info(table_info,conn,cursor)
#通过关键词黑名单匹配表格上方的文本区域,提取需要过滤的表格
# if utils.check_table_title_black_list(line_text):
# db_service.insert_measure_parser_info({
# 'file_id': file_id,
# 'content': f"{range['page_num']}_{range['table_index']}",
# 'type': 'table_index',
# },conn,cursor)
if utils.check_line_text(line_text):
db_service.insert_pdf_parse_process({
'file_id': file_id,
'page_num' : pagenum+1,
'page_count' : 100,
'type' : 'parse_table',
'content':{
'top' : element_top,
'buttom' : element_buttom,
'page_num' : range['page_num'],
'table_index' : range['table_index'],
"type" : text_type,
'content' : line_text,
'sort_num' : range['page_num']*1000 - element_top
}},conn_app,cursor_app)
break
#处理母公司表格标题在页面底部,完整表格在下一页
if element_buttom < 150 and not text_in_table(element_top, tables_range, pagenum+1):
text_type = get_text_type(line_text)
if text_type == 'page_footer':
continue
table_info = {}
# 记录需要过滤掉的页码
if len(re.findall('母公司|现金流量表补充', line_text)) > 0:
db_service.insert_measure_parser_info({
'file_id': file_id,
'content': pagenum+2,
'type': 'parent_com',
},conn_app,cursor_app)
#通过关键词黑名单匹配本页面末尾文字,如果出现
if utils.check_table_title_black_list_button(line_text,button_list):
db_service.insert_measure_parser_info({
'file_id': file_id,
'content': f"{pagenum+2}_1",
'type': 'table_index',
},conn_app,cursor_app)
if utils.check_table_title_black_list_measure(line_text):
db_service.insert_measure_parser_info_measure({
'file_id': file_id,
'content': f"{pagenum+2}_1",
'type': 'measure_index',
},conn_app,cursor_app,line_text)
if re.findall(unit_pattern, line_text):
table_info = get_table_unit_info(file_id,line_text,pagenum+2,1)
db_service.insert_table_unit_info(table_info,conn,cursor)
if utils.check_line_text(line_text):
db_service.insert_pdf_parse_process({
'file_id': file_id,
'page_num' : pagenum+1,
'page_count' : 100,
'type' : 'parse_table',
'content':{
'top' : element_top,
'buttom' : element_buttom,
'page_num' : pagenum+1,
"type" : text_type,
'content' : line_text,
'sort_num' : (pagenum+1)*1000 - element_top
}},conn_app,cursor_app)
if is_empty:
db_service.batch_insert_page_text_nocheck({
'file_id': file_id,
'page_num' : pagenum+1,
'text' : line_texts
},conn,cursor)
#print('文本这里没有重跑')
else:
db_service.batch_insert_page_text({
'file_id': file_id,
'page_num' : pagenum+1,
'text' : line_texts
},conn,cursor)
except Exception as e:
print(f'{pagenum}页处理异常')
print(e)
def get_table_unit_info(file_id,line_text,page_num,table_index):
table_info = {}
table_info['file_id'] = file_id
match = unit_pattern.search(line_text)
if match:
unit = match.group(2)
table_info['unit'] = unit
table_info['page_num'] = page_num
table_info['table_index'] = table_index
#print(table_info)
return table_info
def get_table_text_info(file_id,line_text,page_num,table_index):
table_info = {}
table_info['file_id'] = file_id
table_info['text_info'] = line_text
table_info['page_num'] = page_num
table_info['table_index'] = table_index
#print(table_info)
return table_info
# read tables from the pdf and merge each value with its headers, e.g. 2022年1季度营业收入为xxxxx
def get_table_measure(file_id, pdf_tables, record_range):
"""
:return: tables from the pdf, with each value merged with its headers, e.g. 2022年1季度营业收入为xxxxx
"""
try:
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
conn = mysql.connector.connect(
host = MYSQL_HOST,
user = MYSQL_USER,
password = MYSQL_PASSWORD,
database = MYSQL_DB
)
# 创建一个cursor对象来执行SQL语句
cursor = conn.cursor(buffered=True)
conn_app = mysql.connector.connect(
host = MYSQL_HOST_APP,
user = MYSQL_USER_APP,
password = MYSQL_PASSWORD_APP,
database = MYSQL_DB_APP
)
# 创建一个cursor对象来执行SQL语句
cursor_app = conn_app.cursor(buffered=True)
select_year_select = f"""select report_type,year from report_check where id = {file_id}"""
cursor.execute(select_year_select)
record_select = cursor.fetchall()
report_type = record_select[0][0]
report_year = record_select[0][1]
client = MilvusClient(
uri= MILVUS_CLIENT
)
print('提取指标任务 %s (%s)...' % (record_range, os.getpid()))
start = time.time()
record_start = record_range.split('-')[0]
record_end = record_range.split('-')[1]
for index in range(int(record_start),int(record_end)):
t = pdf_tables[index]
measure_obj =[]
data_dict = {}
measure_list = []
try:
arr = np.array(t['data'])
rows, cols = arr.shape
if rows == 1 and cols == 1:
continue
row_num , col_num = -1 , -1
# 使用嵌套循环遍历数组,获取第一个数值位置
for i in range(rows):
for j in range(cols):
if j == 0 or i == 0:#防止第一列识别出数字
continue
measure_value_config = str(arr[i, j]).replace('(','').replace(')','')
if re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', measure_value_config):
if j == cols-1:
row_num , col_num = i , j
break
elif (re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', measure_value_config)
or measure_value_config == '-'):
row_num , col_num = i , j
break
else:
continue
break
# 遍历数值二维数组,转成带语义的指标
if row_num != -1 and col_num != -1:
for i in range(row_num,arr.shape[0]):
for j in range(col_num,arr.shape[1]):
measure_value = str(arr[i, j]).replace('%','').replace('(','-').replace(')','')
if measure_value == '-' or measure_value == '' or len(measure_value) > 20:
continue
else:
row_num_info = get_row_num_info(arr,row_num,col_num,i,j)
col_num_info = get_col_num_info(arr,row_num,col_num,i,j)
#如果上表头为空则认为是被截断,除了研发投入特殊处理其它过滤
if row_num_info in ('','-',')',''):
continue
#特殊处理非经常性损益合计和非经常性损益净额同时出现时保留净额
if col_num_info == '非经常性损益合计':
continue
if utils.check_pdf_measure_black_list(f"{col_num_info}{row_num_info}"):
continue
#去掉没有周期的指标
if utils.check_pdf_measure(f"{col_num_info}{row_num_info}"):
continue
#判断上表头和左表头周期是否一致,不一致过滤
row_period = utils.get_period_type_other(row_num_info, report_year)
col_period = utils.get_period_type_other(col_num_info, report_year)
if(row_period != col_period and row_period != 'c_n' and col_period != 'c_n'):
continue
units_mapping = {
"百万元": "百万元",
"千万元": "千万元",
"亿元": "亿元",
"万元": "万元",
"千元": "千元",
"": "",
"元/股": ""
}
row_num_info = row_num_info.replace('%','增减')
#num_info = f"{col_num_info}{row_num_info}".replace('','').replace('加:','').replace('减:','').replace('%','')
num_info = utils.get_clean_text(f"{row_num_info}{col_num_info}")
num_info_bak = utils.get_clean_text(f"{col_num_info}{row_num_info}")
measure_unit = ''
#"%": "同期增减"
combined_info = f"{row_num_info} {col_num_info}"
# for unit in units_mapping:
# if unit in row_num_info:
# measure_unit = units_mapping[unit]
# break
if utils.get_percent_flag(row_num_info) == '1':
measure_unit = ''
else:
for unit in units_mapping:
# match the unit either bare or wrapped in parentheses, optionally followed by 人民币
if re.search(rf'\s*{unit}(\s*人民币)?\s*|\(\s*{unit}(\s*人民币)?\s*\)', combined_info) or (re.search(rf'{unit}', combined_info) and any(re.search('单位', item) for item in arr[0])):
measure_unit = units_mapping[unit]
break
measure_list.append({
'measure_name': num_info,
'measure_value': measure_value,
'measure_unit':measure_unit,
})
measure_list.append({
'measure_name': num_info_bak,
'measure_value': measure_value,
'measure_unit':measure_unit,
})
if not redis_client.exists(f'parsed_measure_count_{file_id}'):
redis_client.set(f'parsed_measure_count_{file_id}', 0)
redis_client.incr(f'parsed_measure_count_{file_id}')
if len(measure_list) > 0:
data_dict["measure_list"] = measure_list
data_dict["page_num"] = f"{str(t['page_num'])}_{str(t['table_index'])}"
data_dict['file_id'] = file_id
measure_obj.append(data_dict)
db_service.insert_measure_data_to_milvus(client,measure_obj,cursor_app,conn_app)
except Exception as e:
print(f"循环获取表格数据这里报错了,数据是{t['data']},位置在{index}")
print(f"错误是:{e}")
end = time.time()
print('提取指标 %s runs %0.2f seconds.' % (record_range, (end - start)))
except Exception as e:
print(f'这个错误是{e},所在的位置是{record_start}-{record_end}')
record_start = record_range.split('-')[0]
record_end = record_range.split('-')[1]
for index in range(int(record_start),int(record_end)):
t = pdf_tables[index]
measure_obj =[]
data_dict = {}
measure_list = []
try:
arr = np.array(t['data'])
except Exception as e:
print(f'这个错误是{e}的arr的值是{arr}')
finally:
redis_client.close()
client.close()
cursor.close()
conn.close()
cursor_app.close()
conn_app.close()
#多进程任务分发,根据参数判断是调表格还是正文
def dispatch_job(job_info):
try:
type = job_info['type']
path = job_info['path']
file_id = job_info['file_id']
page_num = job_info['page_num']
tables_range = job_info['tables_range']
if type == 'table':
get_table_range(path, file_id, page_num, tables_range)
except Exception as e:
print(e)
#指标归一化处理
def update_measure_data(file_id,file_path,parent_table_pages):
conn = mysql.connector.connect(
host = MYSQL_HOST,
user = MYSQL_USER,
password = MYSQL_PASSWORD,
database = MYSQL_DB
)
# 创建一个cursor对象来执行SQL语句
cursor = conn.cursor(buffered=True)
# #通过向量查询指标
conn_app = mysql.connector.connect(
host = MYSQL_HOST_APP,
user = MYSQL_USER_APP,
password = MYSQL_PASSWORD_APP,
database = MYSQL_DB_APP
)
# 创建一个cursor对象来执行SQL语句
cursor_app = conn_app.cursor(buffered=True)
print(f'目录黑名单为:{parent_table_pages}')
db_service.delete_to_run(conn,cursor,file_id)
db_service.insert_table_measure_from_vector_async_process(cursor,parent_table_pages,file_id,file_path)
# #指标归一化处理
db_service.update_ori_measure(conn,cursor,file_id)
#db_service.delete_database(conn_app,cursor_app,file_id)
cursor.close()
conn.close()
cursor_app.close()
conn_app.close()
def merge_consecutive_arrays(pdf_info):
merged_objects = []
temp_array = {}
for info_obj in pdf_info:
try:
if info_obj['type'] == 'table':
# the object is a table: start a pending table or extend it with same-width rows
if not temp_array.get('page_num'):
temp_array = info_obj
#else:
# temp_array['data'].extend(info_obj['data'])
elif len(temp_array['data'][0]) == len(info_obj['data'][0]):
temp_array['data'].extend(info_obj['data'])
else:
if temp_array:
# flush the pending table into the merged result list
merged_objects.append(temp_array)
temp_array = {} # reset the pending table
else:
# the object is not a table: flush the pending table if there is one
if temp_array:
# flush the pending table into the merged result list
merged_objects.append(temp_array)
temp_array = {} # reset the pending table
except Exception as e:
#print(info_obj)
print(f"parse error while merging tables: {e}")
if temp_array:
merged_objects.append(temp_array)
return merged_objects
def merge_consecutive_arrays_v1(pdf_info):
merged_objects = []
temp_array = {}
def is_same_dimension(data1, data2):
# check that the two tables have the same row count and matching row lengths
if len(data1) != len(data2):
return False
return all(len(row1) == len(row2) for row1, row2 in zip(data1, data2))
for info_obj in pdf_info:
try:
if info_obj['type'] == 'table':
if not temp_array:
# the pending table is empty: initialise it with the current table
temp_array = info_obj
else:
# check whether the current table has the same dimensions as the pending one
if is_same_dimension(temp_array['data'], info_obj['data']):
# same dimensions: merge the rows
temp_array['data'].extend(info_obj['data'])
else:
# different dimensions: flush the pending table and start a new one with this table
merged_objects.append(temp_array)
temp_array = info_obj
else:
# the object is not a table: flush the pending table if there is one
if temp_array:
merged_objects.append(temp_array)
temp_array = {} # reset the pending table
except Exception as e:
print(f"parse error while merging tables: {e}")
# after the loop, flush any remaining pending table
if temp_array:
merged_objects.append(temp_array)
return merged_objects
def start_table_measure_job(file_id):
conn_app = mysql.connector.connect(
host = MYSQL_HOST_APP,
user = MYSQL_USER_APP,
password = MYSQL_PASSWORD_APP,
database = MYSQL_DB_APP
)
# 创建一个cursor对象来执行SQL语句
cursor_app = conn_app.cursor(buffered=True)
select_process_query = '''
select content from pdf_parse_process WHERE file_id = '{file_id}' and type='parse_table'
'''.format(file_id=file_id)
cursor_app.execute(select_process_query)
records = cursor_app.fetchall()
pdf_info = []
for record in records:
pdf_info.append(eval(record[0]))
sorted_pdf_info = sorted(pdf_info, key=lambda k: k['sort_num'])
pdf_tables = merge_consecutive_arrays(sorted_pdf_info)
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
redis_client.set(f'measure_count_{file_id}', len(pdf_tables))
cursor_app.close()
conn_app.close()
redis_client.close()
records_range_parts = utils.get_range(len(pdf_tables),MEASURE_COUNT)
print(f'records_range_part识别页码的值为{records_range_parts}')
processes = []
for record_range in records_range_parts:
p = Process(target=get_table_measure, args=(file_id,pdf_tables,record_range,))
processes.append(p)
p.start()
for p in processes:
p.join()
if __name__ == "__main__":
file_id = '1778'
page_num = 11
conn = mysql.connector.connect(
host = MYSQL_HOST,
user = MYSQL_USER,
password = MYSQL_PASSWORD,
database = MYSQL_DB
)
# 创建一个cursor对象来执行SQL语句
cursor = conn.cursor(buffered=True)
select_process_query = '''
select content from pdf_parse_process WHERE file_id = '{file_id}' and type='parse_table'
and page_num in(41,42,43)
'''.format(file_id=file_id, page_num=page_num)
cursor.execute(select_process_query)
records = cursor.fetchall()
pdf_info = []
for record in records:
pdf_info.append(eval(record[0]))
sorted_pdf_info = sorted(pdf_info, key=lambda k: k['sort_num'])
pdf_tables = merge_consecutive_arrays(sorted_pdf_info)
get_table_measure(file_id,pdf_tables,'0-2')

View File

@ -0,0 +1,269 @@
from docx import Document
import json
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from lxml import etree
import os
import zipfile
RESULT_TYPE_TEXT = 'text'
RESULT_TYPE_TABLE = 'table'
def build_result(result_type, index, data):
return {
'type': result_type,
'index': index,
'data': data
}
def build_catalog_result(index, depth, data):
return {
'index': index,
'depth': depth,
'data': data
}
# 解析docx文件中的XML内容
def get_xml_content(docx_filename, xml_filename):
with zipfile.ZipFile(docx_filename) as z:
return z.read(xml_filename)
def parse_paragraph(paragraph, index, namespaces):
paragraph_text = paragraph.text.strip() if paragraph else ''
if paragraph_text:
return build_result(RESULT_TYPE_TEXT, index, paragraph_text)
return None
def parse_table(table, index):
table_data = []
for row in table.rows:
row_data = [cell.text for cell in row.cells]
table_data.append(row_data)
return build_result(RESULT_TYPE_TABLE, index, table_data)
def parse_paragraph_element(paragraph_element, index, namespaces):
paragraph_xml = etree.fromstring(paragraph_element.xml)
paragraph_text = ''.join(paragraph_xml.xpath('//w:t/text()', namespaces=namespaces)).strip()
if paragraph_text:
return build_result(RESULT_TYPE_TEXT, index, paragraph_text)
return None
def parse_table_element(table_element, index, namespaces):
table_xml = etree.fromstring(table_element.xml)
table_data = []
for row in table_xml.xpath('//w:tr', namespaces=namespaces):
row_data = []
for cell in row.xpath('./w:tc | ./w:sdt', namespaces=namespaces):
cell_text = ''.join(cell.xpath('.//w:t/text()', namespaces=namespaces)).strip()
grid_span_xpath = etree.XPath('w:tcPr/w:gridSpan/@w:val', namespaces=namespaces)
grid_span = int(grid_span_xpath(cell)[0]) if grid_span_xpath(cell) else 1
if grid_span > 1:
row_data.extend([cell_text] * grid_span)
else:
row_data.append(cell_text)
table_data.append(row_data)
return build_result(RESULT_TYPE_TABLE, index, table_data)
def add_to_catalog(element_xml, index, catalog_content, namespaces, paragraph_text, heading_styles):
p_element = etree.fromstring(element_xml)
# outlineLvl = p_element.xpath('.//w:outlineLvl', namespaces=namespaces)
# if outlineLvl:
# level = int(outlineLvl[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'))
# catalog_content.append(build_catalog_result(index, level, paragraph_text))
level = is_heading_paragraph(p_element, heading_styles, namespaces)
if level != -1:
catalog_content.append(build_catalog_result(index, level, paragraph_text))
# 检查段落是否为标题样式
def is_heading_paragraph(paragraph, heading_styles, namespaces):
pPr = paragraph.find('.//w:pPr', namespaces=namespaces)
if pPr is not None:
pStyle = pPr.find('.//w:pStyle', namespaces=namespaces)
pOutLineLvl = pPr.find('.//w:outlineLvl', namespaces=namespaces)
if pStyle is not None:
style_val = pStyle.get(f"{{{namespaces['w']}}}val")
if style_val.isdigit():
return int(style_val)
if pOutLineLvl is not None:
outLineLvl_val = pOutLineLvl.get(f"{{{namespaces['w']}}}val")
if outLineLvl_val.isdigit():
return int(outLineLvl_val) + 1
# if pStyle is not None and pStyle.get(ns['w'] + 'val') in heading_styles:
# if style_val > 0:
# return True
return -1
def get_paragraph_text(paragraph_element, namespaces):
paragraph_text = ''
for run in paragraph_element.findall('.//w:r', namespaces=namespaces):
for text in run.findall('.//w:t', namespaces=namespaces):
paragraph_text += text.text if text.text is not None else ''
return paragraph_text
def add_to_catalog_paragraph(text, index, catalog_content, namespaces):
# 添加段落到目录
catalog_content.append(build_catalog_result(index, 1, text)) # 假设默认级别为1
def parse_sdt_catalog(sdt_element, catalog_content, index, namespaces):
sdt_content = sdt_element.find('.//w:sdtContent', namespaces=namespaces)
if sdt_content is not None:
for child in sdt_content:
if child.tag.endswith('p'): # 内容控件中的段落
paragraph_text = get_paragraph_text(child, namespaces)
if paragraph_text.strip(): # 检查文本是否为空
add_to_catalog_paragraph(paragraph_text, index, catalog_content, namespaces)
index += 1 # 更新索引
elif child.tag.endswith('tbl'): # 内容控件中的表格
# 处理表格内容(如果需要)
pass
elif child.tag.endswith('sdt'): # 嵌套的内容控件
index = parse_sdt_catalog(child, catalog_content, index, namespaces) # 递归解析嵌套的内容控件
return index
def parse_docx(docx_path):
try:
document = Document(docx_path)
styles_xml = get_xml_content(docx_path, 'word/styles.xml')
except Exception as e:
print(f"Error loading document: {e}")
return None, None
doc_content = [] # 内容(文本+表格)
catalog_content = [] # 目录
current_index = 1 # 维护全局的 index 变量
paragraph_index = 0
table_index = 0
# 获取整个文档的XML内容
xml_root = document.part.element
namespaces = xml_root.nsmap
# 获取所有标题样式
styles_root = etree.fromstring(styles_xml)
heading_styles = set()
for style in styles_root.xpath('//w:style', namespaces=namespaces):
style_type = style.get(namespaces['w'] + 'type')
if style_type == 'paragraph' and style.get(namespaces['w'] + 'styleId').startswith('Heading'):
heading_styles.add(style.get(namespaces['w'] + 'styleId'))
# 遍历文档中的所有元素
for i, element in enumerate(document.element.body):
if isinstance(element, CT_P): # 段落
paragraph_result = parse_paragraph_element(element, current_index, namespaces)
if paragraph_result:
doc_content.append(paragraph_result)
# 判断是否为目录,是就插入目录内容
paragraph = document.paragraphs[paragraph_index]
add_to_catalog(paragraph._element.xml, current_index, catalog_content, namespaces, paragraph.text, heading_styles)
current_index += 1 # 更新 index
paragraph_index += 1
elif isinstance(element, CT_Tbl): # 表格
table_result = parse_table_element(element, current_index, namespaces)
if table_result:
doc_content.append(table_result)
current_index += 1 # 更新 index
table_index += 1
elif element.tag.endswith('sdt'): # 内容控件
current_index = parse_sdt(element, doc_content, current_index, namespaces, catalog_content, heading_styles) # 更新索引
return json.dumps(doc_content, indent=4, ensure_ascii=False), json.dumps(catalog_content, indent=4, ensure_ascii=False)
def parse_sdt(sdt_element, doc_content, current_index, namespaces, catalog_content, heading_styles):
sdtContent = sdt_element.find('.//w:sdtContent', namespaces=namespaces)
if sdtContent is not None:
for child in sdtContent:
if child.tag.endswith('p'): # 内容控件中的段落
paragraph_text = ''
for run in child.findall('.//w:r', namespaces=namespaces):
for text in run.findall('.//w:t', namespaces=namespaces):
paragraph_text += text.text if text.text is not None else ''
if paragraph_text.strip(): # 检查文本是否为空
doc_content.append(build_result(RESULT_TYPE_TEXT, current_index, paragraph_text.strip()))
# 判断是否为目录,是就插入目录内容
add_to_catalog(child.xml, current_index, catalog_content, namespaces, paragraph_text, heading_styles)
current_index += 1 # 更新索引
elif child.tag.endswith('tbl'): # 内容控件中的表格
table_data = []
merged_cells = {} # 用于记录跨行单元格的信息
for row_idx, row in enumerate(child.findall('.//w:tr', namespaces=namespaces)):
row_data = []
for col_idx, cell in enumerate(row.findall('.//w:tc', namespaces=namespaces)):
cell_text = ''
for run in cell.findall('.//w:r', namespaces=namespaces):
for text in run.findall('.//w:t', namespaces=namespaces):
cell_text += text.text if text.text is not None else ''
# 检查单元格是否跨列
grid_span_xpath = etree.XPath('w:tcPr/w:gridSpan/@w:val', namespaces=namespaces)
grid_span = int(grid_span_xpath(cell)[0]) if grid_span_xpath(cell) else 1
if grid_span > 1:
row_data.extend([cell_text.strip()] * grid_span)
else:
row_data.append(cell_text.strip())
# 检查单元格是否跨行
v_merge_xpath = etree.XPath('w:tcPr/w:vMerge/@w:val', namespaces=namespaces)
v_merge = v_merge_xpath(cell)
if v_merge and v_merge[0] == 'restart':
merged_cells[(row_idx, col_idx)] = (int(grid_span), 1)
elif v_merge and v_merge[0] == 'continue':
if (row_idx - 1, col_idx) in merged_cells:
merged_cells[(row_idx - 1, col_idx)] = (merged_cells[(row_idx - 1, col_idx)][0], merged_cells[(row_idx - 1, col_idx)][1] + 1)
# 跨行单元格不需要再次添加到 row_data 中
else:
# 只有非跨行单元格才需要添加到 row_data 中
pass
# 处理跨行单元格
for (r, c), (col_span, row_span) in list(merged_cells.items()):
if r < row_idx:
for i in range(row_span):
if r + i == row_idx:
row_data[c:c] = [row_data[c]] * (col_span - 1)
break
if r + row_span - 1 == row_idx:
del merged_cells[(r, c)]
table_data.append(row_data)
if table_data: # 检查表格数据是否为空
doc_content.append(build_result(RESULT_TYPE_TABLE, current_index, table_data))
current_index += 1 # 更新索引
elif child.tag.endswith('sdt'): # 嵌套的内容控件
current_index = parse_sdt(child, doc_content, current_index, namespaces, catalog_content, heading_styles) # 递归解析嵌套的内容控件
return current_index # 返回更新后的索引
def split_text_table(json_data):
# 分组
text_elements = [element for element in json_data if element['type'] == 'text']
table_elements = [element for element in json_data if element['type'] == 'table']
# 转换为JSON字符串
text_elements_json = json.dumps(text_elements, ensure_ascii=False, indent=4)
table_elements_json = json.dumps(table_elements, ensure_ascii=False, indent=4)
return text_elements_json, table_elements_json
def append_to_file(file_path, text):
try:
with open(file_path, 'a', encoding='utf-8') as file:
file.write(text + '\n')
except Exception as e:
print(f"Error writing to file: {e}")
if __name__ == "__main__":
current_directory = os.getcwd()
docx_relative_path = 'file/docx/101.docx'
file_relative_path = 'file/docx/test1.txt'
docx_path = os.path.join(current_directory, docx_relative_path)
file_path = os.path.join(current_directory, file_relative_path)
try:
parsed_content, catalog_content = parse_docx(docx_path)
if parsed_content and catalog_content:
json_parsed_content = json.loads(parsed_content)
text_elements_json, table_elements_json = split_text_table(json_parsed_content)
append_to_file(file_path, text_elements_json)
append_to_file(file_path, table_elements_json)
append_to_file(file_path, catalog_content)
except Exception as e:
print(f"Error parse_docx: {e}")

View File

@ -0,0 +1,108 @@
from config import MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB
import mysql.connector
from http import HTTPStatus
import dashscope
import random,re
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
dashscope.api_key='sk-63c02fbb9b7d4b0494a3200bec1ae286'
def get_company_name(file_path):
line_text = ''
# 我们从PDF中提取页面,page_numbers=[4,5,6]
for pagenum, page in enumerate(extract_pages(file_path)):
if pagenum > 1:
break
# 找到所有的元素
page_elements = [(element.y1, element) for element in page._objs]
# 查找组成页面的元素
for i,component in enumerate(page_elements):
# 提取页面布局的元素
element = component[1]
# 检查该元素是否为文本元素
if isinstance(element, LTTextBoxHorizontal):
# 检查文本是否出现在表中
line_text += element.get_text()
return llm_service(line_text)
def llm_service(user_prompt):
system_prompt = '''
从以下数据报告中提取公司全称只需要提取中文公司全称不要增加其他内容如果提取不到公司全称请返回-
<数据报告>
<user_prompt>
</数据报告>
'''
system_prompt = system_prompt.replace('<user_prompt>', user_prompt)
response = dashscope.Generation.call(
model='qwen-plus',
prompt = system_prompt,
seed=random.randint(1, 10000),
top_p=0.8,
result_format='message',
enable_search=False,
max_tokens=1500,
temperature=0.85,
repetition_penalty=1.0
)
if response.status_code == HTTPStatus.OK:
result = response['output']['choices'][0]['message']['content']
return result
else:
print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
response.request_id, response.status_code,
response.code, response.message
))
return "llm_error"
def update_company_name(file_id, company_name, cursor, conn):
update_sql = f'''
UPDATE report_check
SET c_name = '{company_name}'
WHERE id = {file_id}
'''
cursor.execute(update_sql)
conn.commit()
if __name__ == '__main__':
conn = mysql.connector.connect(
host = MYSQL_HOST,
user = MYSQL_USER,
password = MYSQL_PASSWORD,
database = MYSQL_DB
)
# 创建一个cursor对象来执行SQL语句
cursor = conn.cursor()
data_query = '''
SELECT id,file_path FROM report_check where c_name is null
'''
cursor.execute(data_query)
data_list = cursor.fetchall()
for data in data_list:
try:
file_id = data[0]
file_path = f'/usr/local/zhanglei/financial/{data[1]}'
print(f'财报{file_id}开始解析')
# file_id = '1329'
# file_path = '/Users/zhengfei/Desktop/cb/zhangjun-600271-2023-nb-nb.pdf'
company_name = get_company_name(file_path)
contains_newline = '\n' in company_name
if contains_newline:
lines = company_name.splitlines(True)
company_name = lines[0]
if company_name != "llm_error":
update_company_name(file_id, company_name, cursor, conn)
except Exception as e:
print(f'财报{file_id}解析失败',e)
cursor.close()
conn.close()

240
zzb_data_word/pdf_title.py Normal file
View File

@ -0,0 +1,240 @@
import PyPDF2
import re
import os,threading
from config import REDIS_HOST,REDIS_PORT,REDIS_PASSWORD
import redis
import db_service
def get_tree_pages(root, info, depth=0,title_array=[]):
"""
Recursively iterate the outline tree
Find the pages pointed by the outline item
and get the assigned physical order id
Decrement with padding if necessary
"""
if isinstance(root, dict):
# print(root)
page = root['/Page'].get_object()
# print(id(page))
t = root['/Title']
title = t
if isinstance(t, PyPDF2.generic.ByteStringObject):
title = t.original_bytes.decode('utf8')
title = title.strip()
title = title.replace('\n', '')
title = title.replace('\r', '')
page_num = info['all_pages'].get(id(page), 0)
if page_num == 0:
print('Not found page number for /Page!', page)
elif page_num < info['padding']:
page_num = 0
else:
page_num -= info['padding']
# str_val = '%-5d' % page_num
# str_val += '\t' * depth
# str_val += title + '\t' + '%3d' % page_num
# print(str_val)
title_array.append({
'title': title,
'page_num': page_num,
'depth': depth
})
# a destination dict is a leaf; iterating it would recurse into its string keys
return title_array
for elem in root:
get_tree_pages(elem, info, depth+1,title_array)
return title_array
def recursive_numbering(obj, info):
"""
Recursively iterate through all the pages in order and assign them a physical
order number
"""
# print(id(obj), obj)
if obj['/Type'] == '/Page':
obj_id = id(obj)
if obj_id not in info['all_pages']:
info['all_pages'][obj_id] = info['current_page_id']
info['current_page_id'] += 1
return
elif obj['/Type'] == '/Pages':
for page in obj['/Kids']:
recursive_numbering(page.get_object(), info)
def get_numbers_between(numbers_between,start, end):
# append every number from start to end (inclusive) to the supplied list
for i in range(start, end + 1):
numbers_between.append(i)
return numbers_between
def get_page_end(start, depth, title_array):
page_end = -1
for i in range(start, len(title_array)):
if title_array[i]['depth'] == depth:
page_end = title_array[i]['page_num']
break
return page_end
def get_file_split(page_count):
# the number of CPU cores caps the number of parts
cpu_count = os.cpu_count()
if page_count < cpu_count:
cpu_count = page_count
# divmod yields the pages per part and the remainder
quotient, remainder = divmod(page_count, cpu_count)
table_split_parts = []
text_split_parts = []
for i in range(cpu_count):
start_num = i * quotient
if i < cpu_count-1:
start_num = i * quotient
end_num = start_num+quotient
else:
end_num = page_count
table_split_parts.append(f'{start_num}-{end_num}')
text_split_parts.append(get_numbers_between([],start_num, end_num))
# return the page ranges for table parsing and the page-number lists for text parsing
return {
'table_split_parts': table_split_parts,
'text_split_parts': text_split_parts
}
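# A small sketch of get_file_split with a hypothetical page count: pages are divided into
# cpu_count ranges, and the remainder is folded into the last part (e.g. on an 8-core box
# a 10-page file yields '0-1', '1-2', ..., '7-10').
def _demo_get_file_split():
parts = get_file_split(10)
print(parts['table_split_parts'])
print(parts['text_split_parts'][-1])  # page numbers covered by the last part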
def create_text_outline(pdf_path, file_id):
# print('Running the script for [%s] with padding [%d]' % (pdf_path, page_number_padding))
# creating an object
with open(pdf_path, 'rb') as file:
file_info = {}
fileReader = PyPDF2.PdfReader(file)
page_count = len(fileReader.pages)
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
redis_client.set(f'page_count_{file_id}', page_count)
info = {
'page_count': page_count,
'all_pages': {},
'current_page_id': 1,
'padding': 0
}
print('Number of pages: %d' % info['page_count'])
pages = fileReader.trailer['/Root']['/Pages'].get_object()
recursive_numbering(pages, info)
#for page_num, page in enumerate(pages['/Kids']):
# page_obj = page.getObject()
# all_pages[id(page_obj)] = page_num + 1 # who starts counting from 0 anyways?
title_array = get_tree_pages(fileReader.outline, info, 0, [])
db_service.pdf_title_insert_mysql(file_id,title_array)
title_array = db_service.get_file_info_from_mysql(file_id)
parent_table_pages_local = {}
parent_table_pages_local[file_id] = []
print(f'{file_id}:{len(title_array)}')
for i in range(len(title_array)):
title_obj = title_array[i]
title = title_obj['title']
#print(f'标题分别是{title}')
if len(re.findall('母公司|现金流量表补充|重要会计政策|会计估计变更|公允价值的披露|合营安排或联营企业中的权益|与金融工具相关的风险|税项|主要控股参股公司|结构化主体情况|公司股份总数及股东结构变动及公司资产和负债结构的变动情况|所有权或使用权受到限制的资产|在建工程|固定资产|其他主体中的权益|分部信息|与金融工具相关的风险|其他关联交易|公司子公司重大事项', title)) >0 :
page_start = title_obj['page_num']
depth = title_obj['depth']
if i < len(title_array) - 1:
page_end = title_array[i+1]['page_num']
if title_array[i]['depth'] in [1,2]:
page_end = get_page_end(i+1, depth, title_array)
else:
page_end = page_count
print(f'pages dropped during outline filtering: {page_start}-{page_end}')
# when the title is 母公司财务报表主要项目注释, keep the last page so the core ROE measure can still be recalled
if len(re.findall('财务报表主要项目注释', title)) == 0:
page_end = page_end - 1
# print(title,page_start,page_end)
for i in range(page_start, page_end + 1):
# add each page in the range to the filter list
parent_table_pages_local[file_id].append(i)
file_info['page_count'] = page_count
file_info['parent_table_pages'] = parent_table_pages_local[file_id]
file_info['split_parts'] = get_file_split(page_count)
redis_client.close()
return file_info
def create_text_outline_disclosure(pdf_path, file_id):
    # Variant of create_text_outline used for disclosure checks: same logic, except
    # the title array is not persisted to (or re-read from) MySQL.
    # print('Running the script for [%s] with padding [%d]' % (pdf_path, page_number_padding))
    # creating an object
    with open(pdf_path, 'rb') as file:
        file_info = {}
        fileReader = PyPDF2.PdfReader(file)
        page_count = len(fileReader.pages)
        redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
        redis_client.set(f'page_count_{file_id}', page_count)
        info = {
            'page_count': page_count,
            'all_pages': {},
            'current_page_id': 1,
            'padding': 0
        }
        print('Number of pages: %d' % info['page_count'])
        pages = fileReader.trailer['/Root']['/Pages'].get_object()
        recursive_numbering(pages, info)
        #for page_num, page in enumerate(pages['/Kids']):
        #    page_obj = page.getObject()
        #    all_pages[id(page_obj)] = page_num + 1  # who starts counting from 0 anyways?
        title_array = get_tree_pages(fileReader.outline, info, 0, [])
        #db_service.pdf_title_insert_mysql(file_id, title_array)
        #title_array = db_service.get_file_info_from_mysql(file_id)
        parent_table_pages_local = {}
        parent_table_pages_local[file_id] = []
        print(f'{file_id}:{len(title_array)}')
        for i in range(len(title_array)):
            title_obj = title_array[i]
            title = title_obj['title']
            #print(f'Title: {title}')
            if len(re.findall('母公司|现金流量表补充|重要会计政策|会计估计变更|公允价值的披露|合营安排或联营企业中的权益|与金融工具相关的风险|税项|主要控股参股公司|结构化主体情况|公司股份总数及股东结构变动及公司资产和负债结构的变动情况|所有权或使用权受到限制的资产|在建工程|固定资产|其他主体中的权益|分部信息|与金融工具相关的风险|其他关联交易|公司子公司重大事项', title)) > 0:
                page_start = title_obj['page_num']
                depth = title_obj['depth']
                if i < len(title_array) - 1:
                    page_end = title_array[i+1]['page_num']
                    if title_array[i]['depth'] in [1, 2]:
                        page_end = get_page_end(i+1, depth, title_array)
                else:
                    page_end = page_count
                print(f'Pages dropped during outline recognition: {page_start}-{page_end}')
                # When the title contains "财务报表主要项目注释", do not trim the last page,
                # so the core ROE measures can still be recalled.
                if len(re.findall('财务报表主要项目注释', title)) == 0:
                    page_end = page_end - 1
                # print(title, page_start, page_end)
                for p in range(page_start, page_end + 1):
                    # collect every page number in the dropped range
                    parent_table_pages_local[file_id].append(p)
        file_info['page_count'] = page_count
        file_info['parent_table_pages'] = parent_table_pages_local[file_id]
        file_info['split_parts'] = get_file_split(page_count)
        redis_client.close()
        return file_info
if __name__ == '__main__':
    import time
    path = "/Users/zhengfei/Desktop/cb/2023年报检测/安妮股份.pdf"
    threading.Thread(target=create_text_outline, args=(path, '111')).start()
    time.sleep(5)
    threading.Thread(target=create_text_outline, args=(path, '222')).start()
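Note: get_page_end is defined elsewhere in this module. As a reading aid, a minimal sketch of what it presumably does (scan forward until the next heading at the same or a shallower depth, and close the range at that heading's page) could look like the following; this is a hypothetical reconstruction, not the shipped implementation.

def get_page_end(start, depth, title_array):
    # walk forward until a heading whose depth is <= the current depth;
    # its page number closes the current section's page range
    for j in range(start, len(title_array)):
        if title_array[j]['depth'] <= depth:
            return title_array[j]['page_num']
    # no later sibling heading: the section runs to the last known title page
    return title_array[-1]['page_num']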

View File

@ -0,0 +1,72 @@
# error reporting
import paramiko
import time
import threading

# run the restart commands on one server
def execute_commands_on_server(hostname, username, password, host):
    try:
        # connect to the server
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(hostname=hostname, username=username, password=password)
        # execute the commands in an interactive shell
        shell = client.invoke_shell()
        # kill and relaunch the app_word.py service
        shell.send("cd /root/pdf_parser/zzb_data_prod\n")
        time.sleep(1)
        shell.send("conda activate py310\n")
        time.sleep(1)
        shell.send("ps -ef | grep app_word.py | grep -v grep | awk '{print $2}' | xargs -r kill -9\n")
        time.sleep(1)
        shell.send("nohup python app_word.py > app.log 2>&1 &\n")
        time.sleep(1)
        # read the output
        output = shell.recv(2048).decode()
        print(f"Output from {hostname}:\n{output}")
    except paramiko.SSHException as e:
        print(f"SSH connection error with {hostname}: {e}")
    finally:
        client.close()

# thread entry point
def thread_function(server):
    execute_commands_on_server(server['hostname'], server['username'], server['password'], server['host'])
servers = [
{'hostname': '124.71.149.225', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '企业服务器1'},
{'hostname': '1.94.143.23', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '企业服务器2'},
{'hostname': '1.94.60.103', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '企业服务器3'},
{'hostname': '124.71.157.162', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '企业服务器4'},
{'hostname': '123.60.16.225', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '企业服务器5'},
{'hostname': '1.94.101.237', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '企业服务器6'},
{'hostname': '113.44.72.157', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '监管服务器1'},
{'hostname': '113.44.52.221', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '监管服务器2'},
{'hostname': '121.37.137.13', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '监管服务器3'},
{'hostname': '1.94.106.10', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '新增服务器1'},
{'hostname': '1.94.182.142', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '新增服务器2'},
{'hostname': '119.3.153.192', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '新增服务器3'},
# {'hostname': '192.168.0.13', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'测试服务器2'},
# {'hostname': '192.168.0.103', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'测试服务器3'},
#
]
# create and start the threads
threads = []
for server in servers:
    thread = threading.Thread(target=thread_function, args=(server,))
    threads.append(thread)
    thread.start()
# wait for all threads to finish
for thread in threads:
    thread.join()
print("All commands executed.")

View File

@ -0,0 +1,67 @@
# error reporting
import paramiko
import time
import threading

# run the restart commands on one server
def execute_commands_on_server(hostname, username, password, host):
    try:
        # connect to the server
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(hostname=hostname, username=username, password=password)
        # execute the commands in an interactive shell
        shell = client.invoke_shell()
        # kill and relaunch the app.py service
        shell.send("cd /root/pdf_parser/zzb_data_prod\n")
        time.sleep(1)
        shell.send("conda activate py310\n")
        time.sleep(1)
        shell.send("ps -ef | grep app.py | grep -v grep | awk '{print $2}' | xargs -r kill -9\n")
        time.sleep(1)
        shell.send("nohup python app.py > app.log 2>&1 &\n")
        time.sleep(1)
        # read the output
        output = shell.recv(2048).decode()
        print(f"Output from {hostname}:\n{output}")
    except paramiko.SSHException as e:
        print(f"SSH connection error with {hostname}: {e}")
    finally:
        client.close()

# thread entry point
def thread_function(server):
    execute_commands_on_server(server['hostname'], server['username'], server['password'], server['host'])
servers = [
{'hostname': '192.168.0.163', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器1'},
{'hostname': '192.168.0.26', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器2'},
{'hostname': '192.168.0.2', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器3'},
{'hostname': '192.168.0.128', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器4'},
{'hostname': '192.168.0.136', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器5'},
{'hostname': '192.168.0.239', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器6'},
{'hostname': '192.168.0.108', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'监管服务器1'},
{'hostname': '192.168.0.131', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'监管服务器2'},
{'hostname': '192.168.0.205', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'监管服务器3'},
# {'hostname': '192.168.0.13', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'测试服务器2'},
# {'hostname': '192.168.0.103', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'测试服务器3'},
#
]
# create and start the threads
threads = []
for server in servers:
    thread = threading.Thread(target=thread_function, args=(server,))
    threads.append(thread)
    thread.start()
# wait for all threads to finish
for thread in threads:
    thread.join()
print("All commands executed.")

View File

@ -0,0 +1,81 @@
# error reporting
import paramiko
import time
import threading

# run the restart commands on one server
def execute_commands_on_server(hostname, username, password, host):
    try:
        # connect to the server
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(hostname=hostname, username=username, password=password)
        # execute the commands in an interactive shell
        shell = client.invoke_shell()
        # kill and relaunch the app_word.py service
        shell.send("cd /root/pdf_parser/zzb_data_word\n")
        time.sleep(1)
        shell.send("conda activate py310\n")
        time.sleep(1)
        shell.send("ps -ef | grep app_word.py | grep -v grep | awk '{print $2}' | xargs -r kill -9\n")
        time.sleep(1)
        shell.send("nohup python app_word.py > app_word.log 2>&1 &\n")
        time.sleep(1)
        # read the output
        output = shell.recv(2048).decode()
        print(f"Output from {hostname}:\n{output}")
    except paramiko.SSHException as e:
        print(f"SSH connection error with {hostname}: {e}")
    finally:
        client.close()

# thread entry point
def thread_function(server):
    execute_commands_on_server(server['hostname'], server['username'], server['password'], server['host'])
servers = [
# {'hostname': '192.168.0.163', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器1'},
# {'hostname': '192.168.0.26', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器2'},
# {'hostname': '192.168.0.2', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器3'},
# {'hostname': '192.168.0.128', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器4'},
# {'hostname': '192.168.0.136', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器5'},
# {'hostname': '192.168.0.239', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器6'},
# {'hostname': '192.168.0.108', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'监管服务器1'},
# {'hostname': '192.168.0.131', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'监管服务器2'},
#{'hostname': '192.168.0.205', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'监管服务器3'},
# {'hostname': '192.168.0.13', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'测试服务器2'},
# {'hostname': '192.168.0.103', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'测试服务器3'},
{'hostname': '124.71.149.225', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器1'},
{'hostname': '1.94.143.23', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器2'},
{'hostname': '1.94.60.103', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器3'},
{'hostname': '124.71.157.162', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器4'},
{'hostname': '123.60.16.225', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器5'},
{'hostname': '1.94.101.237', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器6'},
{'hostname': '113.44.72.157', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'监管服务器1'},
{'hostname': '113.44.52.221', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'监管服务器2'},
{'hostname': '121.37.137.13', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'监管服务器3'},
{'hostname': '1.94.106.10', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '新增服务器1'},
{'hostname': '1.94.182.142', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '新增服务器2'},
{'hostname': '119.3.153.192', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '新增服务器3'},
]
# create and start the threads
threads = []
for server in servers:
    thread = threading.Thread(target=thread_function, args=(server,))
    threads.append(thread)
    thread.start()
# wait for all threads to finish
for thread in threads:
    thread.join()
print("All commands executed.")

30
zzb_data_word/put_code.sh Normal file
View File

@ -0,0 +1,30 @@
#!/bin/bash
# Set the file list and target directory. NOTE: the config file must NOT be transferred.  /root/pdf_parser/zzb_data_prod/utils.py /root/pdf_parser/zzb_data_prod/db_service.py
#FILES="/root/project/zzb_data_word/redis_service.py /root/project/zzb_data_word/zzb_logger.py /root/project/zzb_data_word/parse_word.py /root/project/zzb_data_word/config.py /root/project/zzb_data_word/utils.py /root/project/zzb_data_word/db_service_word.py /root/project/zzb_data_word/app_word.py /root/project/zzb_data_word/main_word.py /root/project/zzb_data_word/word_title.py"
FILES="/root/project/zzb_data_word/parse_word.py"
DEST_PATH="/root/pdf_parser/zzb_data_word"
# Server lists. Primary servers: "1.94.143.23" "113.44.72.157" "1.94.101.237" "123.60.16.225" "124.71.157.162" "1.94.60.103" "1.94.143.23" "124.71.149.225" "113.44.52.221" "121.37.137.13"
#SERVERS=("113.44.72.157" "1.94.101.237" "123.60.16.225" "124.71.157.162" "1.94.60.103" "124.71.149.225" "113.44.52.221" "121.37.137.13" "123.60.28.83" "192.168.0.19" "192.168.0.53" "192.168.0.150" "192.168.0.210" "192.168.0.129" "192.168.0.24" "192.168.0.250" "192.168.0.162" "192.168.0.86" "192.168.0.88" "192.168.0.93" "192.168.0.228" "192.168.0.155" "192.168.0.186" "192.168.0.56" "192.168.0.185" "192.168.0.72" "192.168.0.35" "192.168.0.230" "192.168.0.125" "192.168.0.46" "192.168.0.131")
#SERVERS=("192.168.0.228" "192.168.0.155" "192.168.0.186" "192.168.0.56" "192.168.0.185")
# regulator servers
#SERVERS=("192.168.0.108" "192.168.0.131")
# enterprise servers
#SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239")
# both together
#SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239" "192.168.0.108" "192.168.0.131")
# test
#SERVERS=("192.168.0.103" "192.168.0.13")
# 10/13 production (enterprise + regulator)
#SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239" "192.168.0.108" "192.168.0.131" "192.168.0.205")
# production update
SERVERS=("124.71.149.225" "1.94.143.23" "1.94.60.103" "124.71.157.162" "123.60.16.225" "1.94.101.237" "113.44.72.157" "113.44.52.221" "121.37.137.13")
# upload the files to each server
for SERVER in "${SERVERS[@]}"; do
  echo "Uploading files to $SERVER"
  scp -r $FILES root@$SERVER:$DEST_PATH
  echo "Finished uploading to $SERVER"
done

View File

@ -0,0 +1,30 @@
#!/bin/bash
# Set the file list and target directory. NOTE: the config file must NOT be transferred.  /root/pdf_parser/zzb_data_prod/utils.py /root/pdf_parser/zzb_data_prod/db_service.py
#FILES="/root/project/zzb_data_word/redis_service.py /root/project/zzb_data_word/zzb_logger.py /root/project/zzb_data_word/parse_word.py /root/project/zzb_data_word/config.py /root/project/zzb_data_word/utils.py /root/project/zzb_data_word/db_service_word.py /root/project/zzb_data_word/app_word.py /root/project/zzb_data_word/main_word.py /root/project/zzb_data_word/word_title.py"
FILES="/root/project/zzb_data_prod/pdf_company.py"
DEST_PATH="/root/pdf_parser/zzb_data_prod"
# Server lists. Primary servers: "1.94.143.23" "113.44.72.157" "1.94.101.237" "123.60.16.225" "124.71.157.162" "1.94.60.103" "1.94.143.23" "124.71.149.225" "113.44.52.221" "121.37.137.13"
#SERVERS=("113.44.72.157" "1.94.101.237" "123.60.16.225" "124.71.157.162" "1.94.60.103" "124.71.149.225" "113.44.52.221" "121.37.137.13" "123.60.28.83" "192.168.0.19" "192.168.0.53" "192.168.0.150" "192.168.0.210" "192.168.0.129" "192.168.0.24" "192.168.0.250" "192.168.0.162" "192.168.0.86" "192.168.0.88" "192.168.0.93" "192.168.0.228" "192.168.0.155" "192.168.0.186" "192.168.0.56" "192.168.0.185" "192.168.0.72" "192.168.0.35" "192.168.0.230" "192.168.0.125" "192.168.0.46" "192.168.0.131")
#SERVERS=("192.168.0.228" "192.168.0.155" "192.168.0.186" "192.168.0.56" "192.168.0.185")
# regulator servers
#SERVERS=("192.168.0.108" "192.168.0.131")
# enterprise servers
#SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239")
# both together
#SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239" "192.168.0.108" "192.168.0.131")
# test
#SERVERS=("192.168.0.103" "192.168.0.13")
# 10/13 production (enterprise + regulator)
#SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239" "192.168.0.108" "192.168.0.131" "192.168.0.205")
# production update
SERVERS=("124.71.149.225" "1.94.143.23" "1.94.60.103" "124.71.157.162" "123.60.16.225" "1.94.101.237" "113.44.72.157" "113.44.52.221" "121.37.137.13" "1.94.106.10" "1.94.182.142" "119.3.153.192")
# upload the files to each server
for SERVER in "${SERVERS[@]}"; do
  echo "Uploading files to $SERVER"
  scp -r $FILES root@$SERVER:$DEST_PATH
  echo "Finished uploading to $SERVER"
done
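Note: both upload scripts walk the server list sequentially, so one slow host delays the rest. The same scp calls can be fanned out from Python; a sketch, assuming scp succeeds non-interactively exactly as in the shell loops above:

import subprocess
from concurrent.futures import ThreadPoolExecutor

FILES = ["/root/project/zzb_data_word/parse_word.py"]  # files to push
DEST_PATH = "/root/pdf_parser/zzb_data_word"           # target directory
SERVERS = ["124.71.149.225", "1.94.143.23"]            # abbreviated list

def upload(server: str) -> None:
    # one scp invocation per server, mirroring the shell loop above
    subprocess.run(["scp", "-r", *FILES, f"root@{server}:{DEST_PATH}"], check=True)
    print(f"Finished uploading to {server}")

with ThreadPoolExecutor(max_workers=4) as pool:
    list(pool.map(upload, SERVERS))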

View File

260
zzb_data_word/redis_init.py Normal file
View File

@ -0,0 +1,260 @@
#coding=utf-8
import sys,ast
# from pdfminer.high_level import extract_text
# from pdfminer.pdfparser import PDFParser
# from pdfminer.pdfdocument import PDFDocument
# from pdfminer.pdfpage import PDFPage
import utils
import mysql.connector
# from pymilvus import connections,MilvusClient
import json,time
# import db_service
import ast
import numpy as np
import config_p
import redis_service
from config_p import MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB,REDIS_HOST,REDIS_PORT,REDIS_PASSWORD
# import main
import redis
def run_job(sec):
    time.sleep(sec)

def measure_config_to_db(conn, cursor):
    insert_query = '''
    INSERT INTO measure_config_half_year
    (measure_id, measure_name, ori_measure_id, ori_measure_name, year)
    VALUES (%s, %s, %s, %s, %s)
    '''
    # open the config file
    with open('measure_config_all.txt', 'r', encoding='utf-8') as file:
        # read all lines into a list
        lines = file.readlines()
    # insert one row per line
    for line in lines:
        config_list = line.strip().split(',')
        measure = config_list[0]
        ori_measure = config_list[1]
        ori_measure_id = utils.get_md5(ori_measure)
        data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure, '2024')
        cursor.execute(insert_query, data_to_insert)
    conn.commit()
def insert_measure_vector(conn, cursor):
    # redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
    redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=6)
    # fetch the configured measures; note that the second assignment below
    # overrides the first query
    select_query = '''
    SELECT ori_measure_id,ori_measure_name FROM measure_config_half_year where year='2024'
    '''
    select_query = '''
    SELECT ori_measure_id,ori_measure_name FROM measure_config where year='2023'
    '''
    cursor.execute(select_query)
    records = cursor.fetchall()
    for record in records:
        if redis_client.hexists('measure_config', record[0]):
            measure_vector = redis_client.hget('measure_config', record[0])
        else:
            print('new measure:', record[1])
            vector_obj = utils.embed_with_str(record[1])
            measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])
            redis_client.hset('measure_config', record[0], measure_vector)
    redis_client.close()
    conn.close()
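Note: the vectors are stored as the str() of a Python list, which is also why this module imports ast and numpy. A read-path sketch using the same hash name and storage format as above (load_measure_vector is an illustrative helper, not part of this commit):

import ast

import numpy as np
import redis

def load_measure_vector(redis_client: redis.Redis, ori_measure_id: str) -> np.ndarray:
    # hget returns the str()-serialized embedding written by insert_measure_vector
    raw = redis_client.hget('measure_config', ori_measure_id)
    if raw is None:
        raise KeyError(ori_measure_id)
    if isinstance(raw, bytes):
        raw = raw.decode()
    # ast.literal_eval safely parses the "[0.1, 0.2, ...]" string back into a list
    return np.array(ast.literal_eval(raw))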
# def contains_financial_indicators(text):
#     import re
#     # regex patterns for thousands-separated numbers and percentages
#     pattern = r"\d{1,3}(,\d{3})+(\.\d{1,3})?"
#     pattern1 = r"\d+(.\d+)+%?"
#     # use re.search to look for a match
#     match = re.search(pattern1, text)
#     # return True if a match is found, otherwise False
#     return bool(match)
# def get_clean_text(text):
#     import re
#     pattern = r"\[^)]*?\"
#     matches = re.findall(pattern, text)
#     for match in matches:
#         # check whether the bracketed content mentions a month or the key phrases
#         month_keywords_found = re.search(r"归属于|扣非", match)
#         if not month_keywords_found:
#             # if not, remove that part from the text
#             text = re.sub(pattern, "", text)
#         else:
#             # otherwise strip all punctuation and Chinese numerals
#             text = re.sub(r"[^\w\s]", "", text)
#     print(text)
# def insert_and_update(conn, cursor, client, parent_table_pages, file_id, path):
#     # query the measures via vector search
#     db_service.insert_table_measure_from_vector(conn, cursor, client, parent_table_pages, file_id, path)
#     # normalize the measures
#     db_service.update_ori_measure(conn, cursor, file_id)
# def print_measure_data(cursor,client):
# select_query = '''
# SELECT ori_measure_name,measure_name,ori_measure_id FROM measure_config
# where measure_id not in(select distinct measure_id from ori_measure_list where file_id='64')
# '''
# cursor.execute(select_query)
# records = cursor.fetchall()
# for record in records:
# ori_measure_name = record[0]
# measure_name = record[1]
# ori_measure_id = record[2]
# measure_vector = redis_service.read_from_redis(ori_measure_id)
# measure_list = ast.literal_eval(measure_vector)
# data = [measure_list]
# res = client.search(
# collection_name="pdf_measure_v4", # Replace with the actual name of your collection
# # Replace with your query vector
# data=data,
# limit=2, # Max. number of search results to return
# search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
# output_fields=["measure_name","measure_value","table_num","table_index"],
# filter = 'file_id == "64"'
# )
# vector_str = measure_name+":"+ori_measure_name
# # Convert the output to a formatted JSON string
# for i in range(len(res[0])):
# vector_distance = float(res[0][i]["distance"])
# vector_measure_name = res[0][i]["entity"]["measure_name"]
# measure_value = res[0][i]["entity"]["measure_value"]
# table_num = res[0][i]["entity"]["table_num"]
# table_index = res[0][i]["entity"]["table_index"]
# table_num_list = [106]
# print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index))
# # if vector_distance > 0.89 and table_num not in table_num_list:
# # print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(0.94))
# # if vector_distance > distance and table_num not in table_num_list:
# # print(vector_str +":"+vector_measure_name +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(vector_distance)+":"+str(distance))
if __name__ == "__main__":
# redis_client = redis.Redis(host='123.60.153.169', port=6379, password='Xgf_redis', db=6)
# vector = redis_service.read_from_redis(redis_client,'893301b0e4f1e07d16b4830fcdaea28a')
# print(vector)
conn = mysql.connector.connect(
host=MYSQL_HOST,
user=MYSQL_USER,
password=MYSQL_PASSWORD,
database=MYSQL_DB
)
cursor = conn.cursor()
# measure_config_to_db(conn,cursor)
insert_measure_vector(conn,cursor)
# cursor.close()
# conn.close()
# import re
# text = '减少11.04百分点'
# if re.match(r'(增加|减少)[了]?(\d+\.\d+)[个]?百分点', text):
#     print('Found the unit.')
# unit_pattern = re.compile(r'(增加|减少)[了]?(\d+\.\d+)[个]?百分点')
# match = unit_pattern.search(text)
# print(len(match.groups()))
# if match:
#     print(f'Found the unit.')
# else:
#     print(f'No unit found.')
# row1 = ['比例','比率','占比','费用']
# row2 = ['同比增减','同比上升','同比下降','变化幅度','变动比例','本期比上年同期增减','本年比上年增减','同比变动','本期期末金额较上期期末变动比例']
# for i in range(len(row1)):
# for j in range(len(row2)):
# print(f"{row1[i]}{row2[j]}")
# import os, re
# file_path = '/projects/ai_chat/knowledge_base/ydkf/content/体育运动处方及应用_13925781.docx'
# # split out the file name and extension
# file_base_name, file_extension = os.path.splitext(os.path.basename(file_path))
# file_base_name = file_base_name.replace("_", "").replace("\d+", "")
# file_base_name = re.sub(r'\d+', '', file_base_name)
# print(f'file name: {file_base_name}')
# import re
# print(len(re.findall('母公司|现金流量表补充', '补充资料')))
# import threading
# # create a ThreadLocal variable
# local_data = threading.local()
# # worker function run by each thread
# def worker():
#     # set a value on the current thread's ThreadLocal
#     local_data.data = f"Thread {threading.current_thread().name}'s data"
#     print(local_data.data)
# # create and start several threads
# threads = []
# for i in range(3):
#     thread = threading.Thread(target=worker)
#     thread.start()
#     threads.append(thread)
# # wait for all threads to finish
# for thread in threads:
#     thread.join()
# for i in range(2,5):
# print(i)
# file_url = 'http://static.cninfo.com.cn/finalpage/2023-04-11/1216368607.PDF'
# file_path = utils.save_pdf_from_url(file_url, config.FILE_PATH)
# redis_client = redis.Redis(host='123.60.153.169', port=6379, password='Xgf_redis', db=6)
# print(redis_client.hget('measure_config', '2805fd5b7bfa960eb08312fa3d7c08'))
# client = MilvusClient(
# uri= MILVUS_CLIENT
# )
# conn = mysql.connector.connect(
# host=MYSQL_HOST,
# user=MYSQL_USER,
# password=MYSQL_PASSWORD,
# database=MYSQL_DB
# )
# cursor = conn.cursor()
# print_measure_data(cursor,client)
# redis_service.read_from_file_and_write_to_redis(conn,cursor)
# redis_service.read_from_redis()
# parent_table_pages = []
# file_id = '67'
# path = '/Users/zhengfei/Desktop/上汽车配/上汽车配_1.pdf'
# db_service.insert_table_measure_from_vector_test(conn,cursor,client,parent_table_pages,file_id,path)
# db_service.update_ori_measure(conn,cursor,file_id)
# main.get_table_measure(path,'all',file_id)
# insert_and_update(conn,cursor,client,parent_table_pages,file_id,path)
# measure_config_to_db(conn,cursor)
# params = ['f_102','f_103',]
# for param in params:
# globals()[param] = param.replace('f_','')
# # insert_measure_vector(conn,cursor)
# print(globals()['f_102'])
# db_service.update_ori_measure(conn,cursor,file_id)
# conn.commit()
# cursor.close()
# conn.close()
# # print(utils.get_md5('当期营业收入,2023年营业收入'))
# count_range_parts = utils.get_range(2300)
# print(count_range_parts)

View File

@ -0,0 +1,198 @@
import pandas as pd
import mysql.connector
import utils
#from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
import redis_service
import redis
def process_excel_and_db(input_excel_path1, input_excel_path2, output_file_path):
    # read the first Excel file (the "ttt" sheet with measure definitions)
    df = pd.read_excel(input_excel_path1, sheet_name='Sheet7', header=0)
    # convert the DataFrame to a list of dicts
    data_list = df.to_dict(orient='records')
    # connect to MySQL
    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB
    )
    cursor = conn.cursor()
    # insert the rows into measure_create_config
    insert_query = '''
    INSERT INTO measure_create_config
    (config_id, meta_measure, same_mean_measure, measure_period, change_type, black_list)
    VALUES (%s, %s, %s, %s, %s, %s)
    '''
    for data in data_list:
        show_measure = str(data['指标'])
        same_mean_measure = str(data['同义表述'])
        period_measure = str(data['周期'])
        change_measure = str(data['变动'])
        black_list = str(data['黑名单词'])
        config_id = utils.get_md5(show_measure)
        insert_query_data = (config_id, show_measure, same_mean_measure, period_measure, change_measure, black_list)
        cursor.execute(insert_query, insert_query_data)
    conn.commit()
    # read the second Excel file (the period synonyms sheet)
    df_period = pd.read_excel(input_excel_path2, sheet_name='Sheet11', header=0)
    # convert the DataFrame to a list of dicts
    period_list = df_period.to_dict(orient='records')
    # insert the rows into measure_create_period
    period_insert_query = '''
    INSERT INTO measure_create_period
    (period_name, same_mean_period)
    VALUES (%s, %s)
    '''
    for data in period_list:
        period_name = str(data['标准表述'])
        same_mean_period = str(data['同义表述'])
        insert_query_data = (period_name, same_mean_period)
        cursor.execute(period_insert_query, insert_query_data)
    conn.commit()
    # query both tables back
    data_query = '''
    SELECT * FROM measure_create_config WHERE delete_status = 0
    '''
    period_query = '''
    SELECT * FROM measure_create_period
    '''
    cursor.execute(data_query)
    data_list = cursor.fetchall()
    cursor.execute(period_query)
    period_list = cursor.fetchall()
    # write the expanded (show_name, parser_name) pairs to the output file
    with open(output_file_path, 'w', encoding='utf-8') as file:
        for data in data_list:
            config_id = data[0]
            show_measure = data[1]
            same_mean_measure = data[2]
            period_measure = data[3]
            change_measure = data[4]
            same_mean_measure_arr = []
            period_measure_arr = []
            change_measure_arr = []
            if same_mean_measure != 'nan':
                same_mean_measure_arr = same_mean_measure.split(',')
            same_mean_measure_arr.append(show_measure)
            if period_measure != 'nan':
                period_measure_arr = period_measure.split(',')
            if change_measure != 'nan':
                change_measure_arr = change_measure.split(',')
            for c in change_measure_arr:
                period_measure_arr.append(c)
            for x in period_measure_arr:
                if x in change_measure_arr:
                    show_name = show_measure + x
                else:
                    show_name = x + show_measure
                for y in same_mean_measure_arr:
                    if x in change_measure:
                        parser_name = y + x
                    else:
                        parser_name = x + y
                    file.write(f'{show_name},{parser_name}\n')
                    for p in period_list:
                        period_exra_name = p[0]
                        period_exra_value = p[1]
                        if period_exra_name in x:
                            for v in period_exra_value.split(','):
                                if x in change_measure:
                                    parser_name = y + x.replace(period_exra_name, v)
                                else:
                                    parser_name = x.replace(period_exra_name, v) + y
                                file.write(f'{show_name},{parser_name}\n')
    cursor.close()
    conn.close()
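Note: the nested loops above expand every measure row into (show_name, parser_name) pairs. A worked example with hypothetical cell values may help:

# Worked example (hypothetical row): 指标='营业收入', 同义表述='营收', 周期='本期', 变动='同比增减'.
# same_mean_measure_arr becomes ['营收', '营业收入'] and period_measure_arr becomes ['本期', '同比增减'],
# so the loops emit lines such as:
#   本期营业收入,本期营收            (period terms are prefixed to both names)
#   营业收入同比增减,营收同比增减    (change terms are appended instead)
# plus one extra line per period synonym found in measure_create_period, with the
# synonym substituted into the parser name.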
def measure_config_to_db(conn, cursor, file_path):
    insert_query = '''
    INSERT INTO measure_config_third_quarter
    (measure_id, measure_name, ori_measure_id, ori_measure_name)
    VALUES (%s, %s, %s, %s)
    '''
    check_query = '''
    SELECT ori_measure_id FROM measure_config_third_quarter
    '''
    # open the text file
    with open(file_path, 'r', encoding='utf-8') as file:
        # read all lines into a list
        lines = file.readlines()
    # insert one row per line
    for line in lines:
        config_list = line.strip().split(',')
        measure = config_list[0]
        ori_measure = config_list[1]
        ori_measure_id = utils.get_md5(ori_measure)
        # fetch the existing ids (the dedup check itself is currently disabled)
        cursor.execute(check_query)
        check_records = cursor.fetchall()
        #if any(record[0] == ori_measure_id for record in check_records):
        #    continue
        data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure)
        cursor.execute(insert_query, data_to_insert)
    conn.commit()
def insert_measure_vector(conn, cursor):
    redis_client = redis.Redis(host='192.168.0.172', port=6379, password='Xgf_redis', db=6)  # 192.168.0.172; test: 123.60.153.169
    # fetch the configured measures
    select_query = '''
    SELECT ori_measure_id,ori_measure_name FROM measure_config_1024
    '''
    cursor.execute(select_query)
    records = cursor.fetchall()
    for record in records:
        if redis_client.hexists('measure_config', record[0]):
            measure_vector = redis_client.hget('measure_config', record[0])
        else:
            print('new measure:', record[1])
            vector_obj = utils.embed_with_str(record[1])
            measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])
            redis_client.hset('measure_config', record[0], measure_vector)
    redis_client.close()
    conn.close()
#from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
if __name__ == "__main__":
    MYSQL_HOST = '121.37.185.246'
    MYSQL_PORT = 3306
    MYSQL_USER = 'financial'
    MYSQL_PASSWORD = 'financial_8000'
    MYSQL_DB = 'financial_report'
    # clear the local measure_create_config and measure_create_period tables before running
    process_excel_and_db(
        'ttt_1.xlsx',             # ttt file (measure definitions)
        'period_1.xlsx',          # period file
        'out_2022_new_year.txt'   # output file
    )
    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB
    )
    cursor = conn.cursor()
    file_path = 'out_2022_new_year.txt'
    measure_config_to_db(conn, cursor, file_path)
    insert_measure_vector(conn, cursor)

View File

@ -0,0 +1,17 @@
import redis

# write one measure vector into the 'measure_config' hash in Redis
# (despite the name, this helper only performs the Redis write)
def read_from_file_and_write_to_redis(redis_client, ori_measure_id, measure_vector):
    redis_client.hset('measure_config', ori_measure_id, measure_vector)

# read one measure vector back from the 'measure_config' hash
def read_from_redis(redis_client, ori_measure_id):
    return redis_client.hget('measure_config', ori_measure_id).decode()

# if __name__ == "__main__":
#     # redis_client = redis.Redis(host='123.60.153.169', port=6379, password='Xgf_redis', db=6)
#     redis_client = redis.Redis(host='124.70.129.232', port=6379, password='Xgf_redis', db=6)
#
#     value = read_from_redis(redis_client, "92b44ffb50b6ab2068f5de447c9925")
#     print(value)
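Note: a minimal round-trip through these two helpers; the connection settings and values are illustrative only:

import redis
import redis_service

# illustrative connection; host and db are placeholders
client = redis.Redis(host='127.0.0.1', port=6379, db=6)
redis_service.read_from_file_and_write_to_redis(client, 'demo_id', '[0.1, 0.2]')
assert redis_service.read_from_redis(client, 'demo_id') == '[0.1, 0.2]'
client.close()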

View File

@ -0,0 +1,82 @@
import redis
import logging
# configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def migrate_redis(source_host, source_port, source_password, target_host, target_port, target_password):
    try:
        # connect to the source Redis
        source_redis = redis.StrictRedis(host=source_host, port=source_port, password=source_password,
                                         decode_responses=True)
        # connect to the target Redis
        target_redis = redis.StrictRedis(host=target_host, port=target_port, password=target_password,
                                         decode_responses=True)
        # number of databases on the source
        db_count = int(source_redis.config_get('databases')['databases'])
        logging.info(f"Total databases in source Redis: {db_count}")
        # iterate over every database
        for db in range(db_count):
            try:
                # switch both clients to the current database
                # (SELECT is per-connection, so this assumes each client
                # effectively holds a single pooled connection)
                source_redis.select(db)
                target_redis.select(db)
                logging.info(f"Migrating data from DB {db}")
                # create a pipeline on the target
                pipeline = target_redis.pipeline()
                # iterate over all keys in the current database
                for key in source_redis.scan_iter():
                    try:
                        key_type = source_redis.type(key)  # type of the key
                        logging.info(f"Migrating key: {key} (Type: {key_type}) in DB {db}")
                        # copy the value according to its type
                        if key_type == 'string':
                            value = source_redis.get(key)
                            pipeline.set(key, value)
                        elif key_type == 'hash':
                            hash_data = source_redis.hgetall(key)
                            pipeline.hset(key, mapping=hash_data)  # hset with mapping replaces the deprecated hmset
                        elif key_type == 'list':
                            list_data = source_redis.lrange(key, 0, -1)
                            pipeline.rpush(key, *list_data)
                        elif key_type == 'set':
                            set_data = source_redis.smembers(key)
                            pipeline.sadd(key, *set_data)
                        elif key_type == 'zset':
                            zset_data = source_redis.zrange(key, 0, -1, withscores=True)
                            for member, score in zset_data:
                                pipeline.zadd(key, {member: score})
                        else:
                            logging.warning(f"Unsupported key type: {key_type} for key: {key} in DB {db}")
                    except Exception as e:
                        logging.error(f"Failed to migrate key: {key} in DB {db}. Error: {e}")
                # flush the pipeline in one batch
                pipeline.execute()
                logging.info(f"Migration completed for DB {db}")
            except Exception as e:
                logging.error(f"Failed to migrate DB {db}. Error: {e}")
        logging.info("All databases migrated successfully!")
    except Exception as e:
        logging.error(f"Migration failed. Error: {e}")

# source and target Redis connection settings
source_host = '10.127.2.206'
source_port = 6379
source_password = "Xgf_redis"
target_host = '10.127.2.209'
target_port = 6379
target_password = "dMrt4kmwiW6LDJXy"
# run the migration
migrate_redis(source_host, source_port, source_password, target_host, target_port, target_password)
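Note: after a migration like this, it is worth spot-checking that per-database key counts match. A sketch that reuses the script's clients, logging, and per-database SELECT approach (verify_migration is illustrative, not part of the commit):

def verify_migration(source_redis, target_redis, db_count):
    # compare per-database key counts between source and target
    for db in range(db_count):
        source_redis.select(db)
        target_redis.select(db)
        s, t = source_redis.dbsize(), target_redis.dbsize()
        status = "OK" if s == t else "MISMATCH"
        logging.info(f"DB {db}: source={s} target={t} {status}")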

View File

@ -0,0 +1,14 @@
camelot-py==0.11.0
pdfminer.six==20221105
PyPDF2==3.0.1
pdfplumber==0.10.3
pymilvus==2.3.3
mysql-connector-python==8.3.0
dashscope==1.17.0
fastapi
pydantic
uvicorn
redis
ghostscript
opencv-python-headless
python-docx

View File

@ -0,0 +1,63 @@
import pandas as pd
import json
import utils
import mysql.connector
conn = mysql.connector.connect(
    host='rm-bp1vns6jjy6yu46lhio.mysql.rds.aliyuncs.com',
    user='hematiyu',
    password='00a09f971769499f8c0495505ab0922C',
    database='ai_chat_mgmt_test'
)
# create a cursor to execute SQL
cursor = conn.cursor()
excel_file_path = '/Users/zhengfei/Desktop/healthy_book.xlsx'
# open the Excel workbook
xls = pd.ExcelFile(excel_file_path)
# iterate over every sheet
for sheet_name in xls.sheet_names:
    # read the sheet
    df = pd.read_excel(xls, sheet_name, header=0)
    # convert the DataFrame to a list of dicts
    data_list = df.to_dict(orient='records')
    insert_query = '''
    INSERT INTO ai_chat_book_info
    (name, publish, author, isbn, pub_time, word_flag, category, keywords)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    '''
    for data in data_list:
        name = str(data['书名'])
        if name == 'nan':
            continue
        publish = str(data['出版单位'])
        author = str(data['作者']).replace('[', '')
        isbn = str(data['ISBN'])
        pub_time = str(data['年份'])
        if pub_time == 'NaT':
            pub_time = ''
        else:
            # keep only the leading four characters (the year)
            pub_time = pub_time[:4]
        word_flag = str(data['是否转换为word格式'])
        if word_flag == 'nan':
            word_flag = ''
        category = str(data['分类'])
        if category == 'nan':
            category = ''
        keywords = str(data['关键词'])
        if keywords == 'nan':
            keywords = ''
        insert_query_data = (name, publish, author, isbn, pub_time, word_flag, category, keywords)
        cursor.execute(insert_query, insert_query_data)
    conn.commit()
cursor.close()
conn.close()
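Note: the repeated == 'nan' checks work because each cell is first passed through str(). A small helper, sketched here rather than shipped in the commit, would keep that convention in one place:

import pandas as pd

def cell(data: dict, key: str, default: str = '') -> str:
    # normalize one pandas cell to a clean string, mapping NaN/NaT to a default
    value = data.get(key)
    if value is None or pd.isna(value):
        return default
    return str(value)

# e.g. category = cell(data, '分类')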

156
zzb_data_word/syc_table.py Normal file
View File

@ -0,0 +1,156 @@
import pymssql
import mysql.connector
import logging
from multiprocessing import Pool
# configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# SQL Server configuration
sql_server_config = {
    "server": "203.192.15.17",       # SQL Server IP address
    "port": 28063,                   # SQL Server port
    "user": "zncbuser",              # user name
    "password": "ZZB-Cbindex-data",  # password
    "database": "jydb",              # database name
}
# MySQL configuration
mysql_config = {
    "host": "rm-bp1f85h3xs6mvnf5e3o.mysql.rds.aliyuncs.com",  # MySQL host
    "user": "zzb_jydb",              # user name
    "password": "Ysdbsdjs89Yrqwp",   # password
    "database": "zzb_jydb",          # database name
}
# maximum number of concurrent worker processes
MAX_PROCESSES = 1
def sync_table_structure(table_name):
    try:
        # connect to SQL Server
        sql_server_conn = pymssql.connect(
            server=sql_server_config["server"],
            port=sql_server_config["port"],
            user=sql_server_config["user"],
            password=sql_server_config["password"],
            database=sql_server_config["database"],
        )
        sql_server_cursor = sql_server_conn.cursor()
        # connect to MySQL
        mysql_conn = mysql.connector.connect(**mysql_config)
        mysql_cursor = mysql_conn.cursor()
        logging.info(f"Processing table: {table_name}")
        # check whether the table already exists in MySQL
        mysql_cursor.execute(f"SHOW TABLES LIKE '{table_name}'")
        table_exists = mysql_cursor.fetchone()
        if not table_exists:
            # if not, read the column definitions from SQL Server
            sql_server_cursor.execute(f"""
                SELECT
                    COLUMN_NAME,
                    DATA_TYPE,
                    CHARACTER_MAXIMUM_LENGTH,
                    NUMERIC_PRECISION,
                    NUMERIC_SCALE
                FROM INFORMATION_SCHEMA.COLUMNS
                WHERE TABLE_NAME = '{table_name}'
            """)
            columns = sql_server_cursor.fetchall()
            # build the MySQL CREATE TABLE statement
            create_table_sql = f"CREATE TABLE {table_name} ("
            for col in columns:
                col_name = col[0]
                col_type = col[1]
                # column length / precision
                char_length = col[2]
                numeric_precision = col[3]
                numeric_scale = col[4]
                # simple type mapping (may need adjustment for other source types)
                if col_type == "varchar":
                    # use the declared length when available (-1 means varchar(max))
                    if char_length and char_length > 0:
                        col_type = f"VARCHAR({char_length})"
                    else:
                        col_type = "VARCHAR(255)"
                elif col_type == "int":
                    col_type = "INT"
                elif col_type == "datetime":
                    col_type = "DATETIME"
                elif col_type == "decimal":
                    if numeric_precision and numeric_scale:
                        col_type = f"DECIMAL({numeric_precision}, {numeric_scale})"
                    else:
                        col_type = "DECIMAL(10, 2)"  # fallback
                elif col_type == "money":
                    col_type = "DECIMAL(19, 4)"
                elif col_type == "smallmoney":
                    col_type = "DECIMAL(19, 4)"
                elif col_type == "image":
                    col_type = "LONGBLOB"
                # set the NULL attribute of the column
                if col_name.lower() == "id":
                    # the ID column is NOT NULL
                    create_table_sql += f"`{col_name}` {col_type} NOT NULL, "
                else:
                    # all other columns are nullable
                    create_table_sql += f"`{col_name}` {col_type} , "
            # add the primary key constraint (assumes the first column is the key)
            create_table_sql = create_table_sql.rstrip(", ") + f", PRIMARY KEY ({columns[0][0]}))"
            logging.info(f"Create table SQL: {create_table_sql}")
            # create the table in MySQL
            mysql_cursor.execute(create_table_sql)
            logging.info(f"Table {table_name} created in MySQL.")
        else:
            logging.info(f"Table {table_name} already exists in MySQL. Skipping...")
        # close the connections
        sql_server_cursor.close()
        sql_server_conn.close()
        mysql_cursor.close()
        mysql_conn.close()
        logging.info(f"Sync completed for table: {table_name}")
    except Exception as e:
        logging.error(f"Failed to sync table {table_name}. Error: {e}")
def main():
    try:
        # connect to SQL Server
        sql_server_conn = pymssql.connect(
            server=sql_server_config["server"],
            port=sql_server_config["port"],
            user=sql_server_config["user"],
            password=sql_server_config["password"],
            database=sql_server_config["database"],
        )
        sql_server_cursor = sql_server_conn.cursor()
        # list all base tables in SQL Server
        sql_server_cursor.execute("SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE' ORDER BY TABLE_NAME")
        tables = sql_server_cursor.fetchall()
        # process the tables concurrently with a process pool
        with Pool(processes=MAX_PROCESSES) as pool:
            pool.map(sync_table_structure, [table[0] for table in tables])
        logging.info("All tables synced successfully!")
    except Exception as e:
        logging.error(f"Main function failed. Error: {e}")
    finally:
        # close the connections
        if 'sql_server_cursor' in locals():
            sql_server_cursor.close()
        if 'sql_server_conn' in locals():
            sql_server_conn.close()

# entry point
if __name__ == "__main__":
    main()
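Note: sync_table_structure only mirrors schemas; row data still has to be copied separately. A batched copy step might look like the following sketch, reusing the config dicts above (copy_table_data and the batch size are illustrative):

def copy_table_data(table_name, batch_size=1000):
    # stream rows out of SQL Server and bulk-insert them into MySQL in batches
    src = pymssql.connect(**sql_server_config)
    dst = mysql.connector.connect(**mysql_config)
    src_cur, dst_cur = src.cursor(), dst.cursor()
    src_cur.execute(f"SELECT * FROM {table_name}")
    placeholders = ", ".join(["%s"] * len(src_cur.description))
    insert_sql = f"INSERT INTO {table_name} VALUES ({placeholders})"
    while True:
        rows = src_cur.fetchmany(batch_size)
        if not rows:
            break
        dst_cur.executemany(insert_sql, rows)
        dst.commit()
    src_cur.close(); src.close()
    dst_cur.close(); dst.close()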

32
zzb_data_word/tables.txt Normal file
View File

@ -0,0 +1,32 @@
Run task text (35857)...
{'file_id': '5555', 'unit': '万元', 'page_num': 5, 'table_index': 2}
{'file_id': '5555', 'unit': '万元', 'page_num': 6, 'table_index': 1}
{'file_id': '5555', 'unit': '万元', 'page_num': 6, 'table_index': 2}
{'file_id': '5555', 'unit': '万元', 'page_num': 8, 'table_index': 2}
Task text runs 25.38 seconds.
{'top': 143.97104000000002, 'buttom': 133.41104, 'page_num': 2, 'type': 'text', 'content': '□适用 √不适用 ', 'sort_num': 1856.02896}
{'top': 197.01104, 'buttom': 110.61103999999999, 'page_num': 4, 'type': 'text', 'content': '公司注册地址 公司注册地址的历史变更情况 公司办公地址 公司办公地址的邮政编码 公司网址 电子信箱 ', 'sort_num': 3802.98896}
{'top': 196.41104, 'buttom': 110.13104, 'page_num': 4, 'type': 'text', 'content': '北京市海淀区杏石口路甲18号航天信息园 无 北京市海淀区杏石口路甲18号航天信息园 100195 http://www.aisino.com stock@aisino.com ', 'sort_num': 3803.58896}
{'top': 555.1410400000001, 'buttom': 530.90104, 'page_num': 5, 'table_index': 2, 'type': 'page_footer', 'content': '公司聘请的会计师事务所(境内) ', 'sort_num': 4444.85896}
{'top': 503.42104, 'buttom': 475.15000000000003, 'page_num': 5, 'table_index': 2, 'type': 'page_footer', 'content': '七、 近三年主要会计数据和财务指标 (一) 主要会计数据 ', 'sort_num': 4496.57896}
{'top': 470.18104, 'buttom': 459.62104, 'page_num': 5, 'table_index': 2, 'type': 'page_footer', 'content': '单位:万元 币种:人民币 ', 'sort_num': 4529.81896}
{'top': 458.15999999999997, 'buttom': 273.36, 'page_num': 5, 'table_index': 2, 'type': 'table', 'data': [['主要会计数据', '2023年', '2022年', '本期比上年同期增减(%)', '2021年'], ['营业收入', '1257482.20', '1931407.52', '-34.89', '2351554.42'], ['归属于上市公司股东的净利润', '20271.53', '107841.40', '-81.20', '102357.97'], ['归属于上市公司股东的扣除非经常性损益的净利润', '2704.30', '94848.41', '-97.15', '134689.72'], ['经营活动产生的现金流量净额', '87498.48', '186388.43', '-53.06', '187453.86'], ['', '2023年末', '2022年末', '本期末比上年同期末增减(%', '2021年末'], ['归属于上市公司股东的净资产', '1404971.01', '1414749.10', '-0.69', '1334971.44'], ['总资产', '2269076.18', '2370768.08', '-4.29', '2283286.86']], 'sort_num': 4541.84}
{'top': 241.68104000000002, 'buttom': 230.42408, 'page_num': 5, 'table_index': 3, 'type': 'page_footer', 'content': '(二) 主要财务指标 ', 'sort_num': 4758.31896}
{'top': 226.79999999999998, 'buttom': 86.88, 'page_num': 5, 'table_index': 3, 'type': 'table', 'data': [['主要财务指标', '2023年', '2022年', '本期比上年同期增减(%)', '2021年'], ['基本每股收益(元/股)', '0.11', '0.58', '-81.03', '0.55'], ['稀释每股收益(元/股)', '0.11', '0.58', '-81.03', '0.55'], ['扣除非经常性损益后的基本每股收益(元/股)', '0.01', '0.51', '-98.04', '0.73'], ['加权平均净资产收益率(%', '1.45', '7.90', '下降6.45个百分点', '7.95'], ['扣除非经常性损益后的加权平均净资产收益率(%', '0.19', '6.94', '下降6.75个百分点', '10.46']], 'sort_num': 4773.2}
{'top': 473.78103999999996, 'buttom': 446.66103999999996, 'page_num': 6, 'table_index': 1, 'type': 'page_footer', 'content': '(三) 境内外会计准则差异的说明: □适用 √不适用 ', 'sort_num': 5526.21896}
{'top': 426.98104, 'buttom': 415.6924, 'page_num': 6, 'table_index': 1, 'type': 'page_footer', 'content': '九、 2023 年分季度主要财务数据 ', 'sort_num': 5573.01896}
{'top': 410.30104, 'buttom': 399.74104, 'page_num': 6, 'table_index': 1, 'type': 'page_footer', 'content': '单位:万元 币种:人民币 ', 'sort_num': 5589.69896}
{'top': 398.4, 'buttom': 258.96, 'page_num': 6, 'table_index': 1, 'type': 'table', 'data': [['', '第一季度1-3月份', '第二季度4-6月份', '第三季度7-9月份', '第四季度10-12月份'], ['营业收入', '350592.33', '347900.54', '305553.10', '253436.23'], ['归属于上市公司股东的净利润', '24569.40', '28044.65', '-6452.14', '-25890.38'], ['归属于上市公司股东的扣除非经常性损益后的净利润', '-8313.07', '16084.23', '2564.63', '-7631.49'], ['经营活动产生的现金流量净额', '-116544.51', '51455.38', '-40594.82', '193182.43']], 'sort_num': 5601.6}
{'top': 243.84104000000002, 'buttom': 219.72104000000002, 'page_num': 6, 'table_index': 2, 'type': 'page_footer', 'content': '季度数据与已披露定期报告数据差异说明 □适用 √不适用 ', 'sort_num': 5756.15896}
{'top': 200.01104, 'buttom': 172.89104, 'page_num': 6, 'table_index': 2, 'type': 'page_footer', 'content': '十、 非经常性损益项目和金额 √适用 □不适用 ', 'sort_num': 5799.98896}
{'top': 169.77104, 'buttom': 159.21104, 'page_num': 6, 'table_index': 2, 'type': 'page_footer', 'content': '单位:万元 币种:人民币 ', 'sort_num': 5830.22896}
{'top': 157.92, 'buttom': 101.52, 'page_num': 6, 'table_index': 2, 'type': 'table', 'data': [['非经常性损益项目', '2023年金额', '附注(如适用)', '2022年金额', '2021年金额'], ['非流动性资产处置损益,包括已计提资产减值准备的冲销部分', '600.11', '-', '224.25', '814.45']], 'sort_num': 5842.08}
{'top': 765.8399999999999, 'buttom': 87.84, 'page_num': 7, 'table_index': 1, 'type': 'table', 'data': [['非经常性损益项目', '2023年金额', '附注(如适用)', '2022年金额', '2021年金额'], ['计入当期损益的政府补助,但与公司正常经营业务密切相关、符合国家政策规定、按照确定的标准享有、对公司损益产生持续影响的政府补助除外', '7847.97', '-', '12602.14', '12861.57'], ['除同公司正常经营业务相关的有效套期保值业务外,非金融企业持有金融资产和金融负债产生的公允价值变动损益以及处置金融资产和金融负债产生的损益', '18586.19', '主要是公司持有的中油资本股票处置收益以及公允价值变动损益。', '6233.14', '-40552.70'], ['计入当期损益的对非金融企业收取的资金占用费', '', '', '', ''], ['委托他人投资或管理资产的损益', '', '', '', ''], ['对外委托贷款取得的损益', '', '', '', ''], ['因不可抗力因素,如遭受自然灾害而产生的各项资产损失', '', '', '', ''], ['单独进行减值测试的应收款项减值准备转回', '1391.58', '', '', ''], ['企业取得子公司、联营企业及合营企业的投资成本小于取得投资时应享有被投资单位可辨认净资产公允价值产生的收益', '', '', '', ''], ['同一控制下企业合并产生的子公司期初至合并日的当期净损益', '', '', '', ''], ['非货币性资产交换损益', '', '', '', ''], ['债务重组损益', '', '', '', ''], ['企业因相关经营活动不再持续而发生的一次性费用,如安置职工的支出等', '', '', '', ''], ['因税收、会计等法律、法规的调整对当期损益产生的一次性影响', '', '', '', ''], ['因取消、修改股权激励计划一次性确认的股份支付费用', '', '', '', ''], ['对于现金结算的股份支付,在可行权日之后,应付职工的公允价值变动产生的损益', '', '', '', ''], ['采用公允价值模式进行后续计量的投资性房地产公允价值变动产生的损益', '', '', '', ''], ['交易价格显失公允的交易产生的收益', '', '', '', ''], ['与公司正常经营业务无关的或有事项产生的损益', '', '', '', ''], ['受托经营取得的托管费收入', '', '', '', '']], 'sort_num': 6234.16}
{'top': 765.8399999999999, 'buttom': 625.68, 'page_num': 8, 'table_index': 1, 'type': 'table', 'data': [['非经常性损益项目', '2023年金额', '附注(如适用)', '2022年金额', '2021年金额'], ['除上述各项之外的其他营业外收入和支出', '-6149.80', '-', '-1777.00', '1315.87'], ['其他符合非经常性损益定义的损益项目', '220.56', '-', '243.50', '226.16'], ['减:所得税影响额', '3750.45', '-', '1383.14', '2257.64'], ['少数股东权益影响额(税后)', '1178.93', '-', '3149.90', '4739.46'], ['合计', '17567.23', '', '12992.99', '-32331.75']], 'sort_num': 7234.16}
{'top': 539.3010400000001, 'buttom': 512.1810399999999, 'page_num': 8, 'table_index': 2, 'type': 'page_header', 'content': '十一、 采用公允价值计量的项目 √适用 □不适用 ', 'sort_num': 7460.69896}
{'top': 509.06104, 'buttom': 470.78103999999996, 'page_num': 8, 'table_index': 2, 'type': 'page_header', 'content': '单位:万元 币种:人民币 对当期利润的影响金额 ', 'sort_num': 7490.93896}
{'top': 497.03999999999996, 'buttom': 385.44, 'page_num': 8, 'table_index': 2, 'type': 'table', 'data': [['项目名称', '期初余额', '期末余额', '当期变动', '对当期利润的影响金额'], ['以公允价值计量且其变动计入当期损益的金融资产', '164526.24', '91132.43', '-73393.81', '20153.57'], ['指定为以公允价值计量且其变动计入其他综合收益的金融资产', '30864.33', '40186.93', '9322.60', '1078.00'], ['合计', '195390.57', '131319.36', '-64071.21', '21231.57']], 'sort_num': 7502.96}
{'top': 161.73104, 'buttom': 82.34304, 'page_num': 8, 'type': 'text', 'content': '改革作为推动转型升级的“关键一招”不断激发动力活力。一是围绕新时代国资央企“三个总”“三个作用”和新一轮国企改革“三个明显成效”要求系统研究形成“科改行动”和改革深化提升行动实施方案2023-2025 年)及工作台账,全面完成各项年度改革任务;二是加大改革力度、保持改革节奏,推动重要改革举措深化扩围,中长期激励工作成为航天科工集团先进典型;三是聚焦重点环节持续深化三项制度改革,压紧压实“一岗一表”差异化考核责任压力,经理层绩效年薪占年度薪酬比例 61.8%,管理人员不胜任退出率 14.3%,均达到央企优秀水平。进一', 'sort_num': 7838.26896}
{'top': 134.73104, 'buttom': 123.47408, 'page_num': 9, 'type': 'text', 'content': '二、报告期内公司所处行业情况 ', 'sort_num': 8865.26896}
{'top': 118.05104, 'buttom': 80.25504, 'page_num': 9, 'type': 'text', 'content': '国家高度重视培育数字经济、构建数字社会,数字中国、网络强国等战略从实践探索阶段发展至国家统筹策划、科学实施阶段。粮食安全、农业强国、乡村振兴等国家战略的实施,以及深化税收征管改革等国家级重大部署中,均明确了顺应数字经济发展规律、加大信息技术创新应用', 'sort_num': 8881.94896}
{'top': 207.45104, 'buttom': 87.93504, 'page_num': 10, 'type': 'text', 'content': '航天信息公司作为航天科工集团控股企业、以信息安全技术为核心的国有科技型上市公司,自成立以来,坚持服务国家战略、服务国计民生,依托航天的技术优势、人才优势,加快完善中国特色现代企业制度,有效提升公司治理水平,切实增强企业改革发展活力,扎实推动企业高质量发展。经过 20 余年的发展壮大,逐渐成为行业内具有一定影响力的上市公司,核心竞争力主要表现在以下九个方面:有清晰的战略定位和明确的发展目标,有完善的技术与产品体系,有“科改示范企业”的专项改革政策,有建设世界一流专业领军企业的综合实力,有千万级的庞大用户群体,有国家和行业的完备顶级资质,有充裕的现金资产与强大的融资能力,有遍布全国的营销售后服务体系,有央企背景和航天品牌提供的丰沛资源与信用背书,有一支想干事、能干事、干成事的干部职工队伍。 ', 'sort_num': 9792.54896}

View File

@ -0,0 +1,22 @@
"","","适用(如)","",""
"非流动性资产处置损益,包括已计提资产减值准备的冲销部分","-236316.65","","232448.97","-46760.24"
"计入当期损益的政府补助,但与公司正常经营业务密切相关、符合国家政策规定、按照确定的标准享有、对公司损益产生持续影响的政府补助除外","4471155.00","","9188174.79","13052067.83"
"除同公司正常经营业务相关的有效套期保值业务外,非金融企业持有金融资产和金融负债产生的公允价值变动损益以及处置金融资产和金融负债产生的损益","13099776.76","","14132376.82","7256455.55"
"计入当期损益的对非金融企业收取的资金占用费","","","",""
"委托他人投资或管理资产的损益","","","",""
"对外委托贷款取得的损益","","","",""
"因不可抗力因素,如遭受自然灾害而产生的各项资产损失","-3826330.90","","",""
"单独进行减值测试的应收款项减值准备转回","","","",""
"企业取得子公司、联营企业及合营企业的投资成本小于取得投资时应享有被投资单位可辨认净资产公允价值产生的收益","","","",""
"同一控制下企业合并产生的子公司期初至合并日的当期净损益","","","",""
"非货币性资产交换损益","","","",""
"债务重组损益","","","",""
"企业因相关经营活动不再持续而发生的一次性费用,如安置职工的支出等","","","",""
"因税收、会计等法律、法规的调整对当期损益产生的一次性影响","","","",""
"因取消、修改股权激励计划一次性确认的股份支付费用","","","",""
"对于现金结算的股份支付,在可行权日之后,应付职工薪酬的公允价值变动产生的损益","","","",""
"采用公允价值模式进行后续计量的投资性房地产公允价值变动产生的损益","","","",""
"交易价格显失公允的交易产生的收益","","","",""
"与公司正常经营业务无关的或有事项产生的损益","","","",""
"受托经营取得的托管费收入","","","",""
"除上述各项之外的其他营业外收","-11648682.96","","-529596.32","34351.19"

View File

@ -0,0 +1,5 @@
"入和支出","","","",""
"其他符合非经常性损益定义的损益项目","","","-757389.60","-729432.00"
"减:所得税影响额","278940.19","","3339902.20","2935002.34"
"少数股东权益影响额(税后)","","","",""
"合计","1580661.06","","18926112.46","16631679.99"

View File

@ -0,0 +1,3 @@
"项目名称","期初余额","期末余额","当期变动","对当期利润的影响金额"
"交易性金融资产","390568609.77","175421746.58","-215146863.19","-146863.19"
"合计","390568609.77","175421746.58","-215146863.19","-146863.19"

View File

@ -0,0 +1 @@
"","","","","","","","","","",""

View File

@ -0,0 +1,5 @@
"序号","评价维度","指标","公司产品注册标准","2020版中国药典标准","欧洲药典9.0版标准"
"123","杂质含量","卵清蛋白含量","≤60ng/mL","≤200ng/mL","≤500ng/mL"
"","","蛋白质含量","≤360μg/mL","≤400μg/mL","≤600μg/mL"
"","","游离甲醛含量","≤25μg/mL","≤50μg/mL","≤200μg/mL"
"4","有效成分纯度","蛋白质含量/血凝素含量","≤3.0","≤4.5","≤6.0"

View File

@ -0,0 +1,8 @@
"","本年新增","本年新增","累计数量","累计数量"
"","申请数(个)","获得数(个)","申请数(个)","获得数(个)"
"发明专利","6","3","16","6"
"实用新型专利","2","","12","10"
"外观设计专利","","","",""
"软件著作权","","","",""
"其他","","","",""
"合计","8","3","28","16"

View File

@ -0,0 +1,6 @@
"","本年度","上年度","变化幅度(%"
"费用化研发投入","15471820.82","32409476.90","-52.26"
"资本化研发投入","15990870.05","13732758.96","16.44"
"研发投入合计","31462690.87","46142235.86","-31.81"
"研发投入总额占营业收入比例(%","23.38","14.49","增加8.89个百分点"
"研发投入资本化的比重(%","50.82","29.76","增加21.06个百分点"

View File

@ -0,0 +1,12 @@
"","","资规模","金额","金额","阶段性成果","到目标","水平","应用前景"
"1","冻干人用狂犬病疫苗Vero细胞","10000.00","1599.09","11578.76","注册申请中","获得生产批件","国内领先","用于预防狂犬病"
"2","四价流感病毒裂解疫苗(儿童)","33000.00","410.69","1481.50","III期临床试验前期准备中","获得生产批件","国内领先","用于预防流行性感冒"
"3","23价肺炎球菌多糖疫苗/13价肺炎球菌多糖结合疫苗","22980.00","123.49","631.25","临床前研究","获得生产批件","国内领先","用于预防肺炎"
"4","冻干水痘减毒活疫苗","31975.00","225.03","946.69","临床前研究","获得生产批件","国内领先","用于预防水痘"
"5","四价流感病毒裂解疫苗(高剂量)","11745.00","110.64","1961.90","临床前研究","获得生产批件","国内领先","用于预防流行性感冒"
"6","重组带状疱疹疫苗","31975.00","168.99","429.68","临床前研究","获得生产批件","国内领先","用于预防带状疱疹"
"7","冻干人用狂犬病疫苗MRC-5细胞","27915.00","33.77","200.46","临床前研究","获得生产批件","国内领先","用于预防狂犬病"
"8","多价手足口病疫苗","29910.00","33.77","199.29","临床前研究","获得生产批件","国内领先","用于预防手足口病"
"9","注射用重组人IL12/15-PDL1单纯疱疹I型溶瘤病毒注射液","38910.00","33.49","350.71","临床前研究","获得生产批件","新药","实体瘤治疗"
"10","在中国3至8岁儿童中四价流感病毒裂解疫苗2针次免疫程序的探索研究","300.00","54.38","225.80","临床研究完成","获得注册批件","国内领先","预防流行性感冒"
"合计","/","238710.00","2793.34","18006.04","/","/","/","/"

View File

@ -0,0 +1,6 @@
"基本情况","基本情况","基本情况"
"","本期数","上期数"
"公司研发人员的数量(人)","60","58"
"研发人员数量占公司总人数的比例(%","13.10","12.24"
"研发人员薪酬合计","1012.67","932.12"
"研发人员平均薪酬","16.88","16.07"

View File

@ -0,0 +1,14 @@
"研发人员学历结构","研发人员学历结构"
"学历结构类别","学历结构人数"
"博士研究生","3"
"硕士研究生","6"
"本科","40"
"专科","10"
"高中及以下","1"
"研发人员年龄结构","研发人员年龄结构"
"年龄结构类别","年龄结构人数"
"30岁以下不含30岁","29"
"30-40岁含30岁不含40岁","20"
"40-50岁含40岁不含50岁","5"
"50-60岁含50岁不含60岁","4"
"60岁及以上","2"

View File

@ -0,0 +1,10 @@
"科目","本期数","上年同期数","变动比例(%"
"营业收入","134591377.00","318486074.97","-57.74"
"营业成本","29864436.32","50588057.11","-40.97"
"销售费用","77073744.58","107494355.33","-28.30"
"管理费用","58638054.44","60622550.89","-3.27"
"财务费用","42981.30","-355527.32","不适用"
"研发费用","15471820.82","32409476.90","-52.26"
"经营活动产生的现金流量净额","80904692.08","38595320.99","109.62"
"投资活动产生的现金流量净额","-187707765.08","112695639.52","-266.56"
"筹资活动产生的现金流量净额","2517734.96","-13250290.31","不适用"

View File

@ -0,0 +1,2 @@
"主营业务分行业情况"
"营业收入营业成本毛利率毛利率分行业营业收入营业成本比上年增比上年增比上年(%)减(%)减(%)增减"

View File

@ -0,0 +1,11 @@
"%","%","%","%","%","%","%"
"减少生物制药134591377.0029864436.3277.81-57.74-40.976.31个百分点","减少生物制药134591377.0029864436.3277.81-57.74-40.976.31个百分点","减少生物制药134591377.0029864436.3277.81-57.74-40.976.31个百分点","减少生物制药134591377.0029864436.3277.81-57.74-40.976.31个百分点","减少生物制药134591377.0029864436.3277.81-57.74-40.976.31个百分点","减少生物制药134591377.0029864436.3277.81-57.74-40.976.31个百分点","减少生物制药134591377.0029864436.3277.81-57.74-40.976.31个百分点"
"主营业务分产品情况","主营业务分产品情况","主营业务分产品情况","主营业务分产品情况","主营业务分产品情况","主营业务分产品情况","主营业务分产品情况"
"分产品","营业收入","营业成本","毛利率(%","营业收入比上年增减(%","营业成本比上年增减(%","毛利率比上年增减(%"
"四价流感病毒裂解疫苗","134591377.00","29864436.32","77.81","-57.74","-40.97","减少6.31个百分点"
"主营业务分地区情况","主营业务分地区情况","主营业务分地区情况","主营业务分地区情况","主营业务分地区情况","主营业务分地区情况","主营业务分地区情况"
"分地区","营业收入","营业成本","毛利率(%","营业收入比上年增减(%","营业成本比上年增减(%","毛利率比上年增减(%"
"国内","134591377.00","29864436.32","77.81","-57.74","-40.97","减少6.31个百分点"
"主营业务分销售模式情况","主营业务分销售模式情况","主营业务分销售模式情况","主营业务分销售模式情况","主营业务分销售模式情况","主营业务分销售模式情况","主营业务分销售模式情况"
"销售模式","营业收入","营业成本","毛利率(%","营业收入比上年增减(%","营业成本比上年增减(%","毛利率比上年增减(%"
"直销","134591377.00","29864436.32","77.81","-57.74","-40.97","减少6.31个百分点"

View File

@ -0,0 +1,2 @@
"主要产品","单位","生产量","销售量","库存量","生产量比上年增减(%","销售量比上年增减(%","库存量比上年增减(%"
"四价流感病毒裂解疫苗","瓶","2945705","1381358","1152015","-53.20","-51.74","-63.93"

View File

@ -0,0 +1,11 @@
"分行业","成本构成项目","本期金额","本期占总成本比例(%)","上年同期金额","上年同期占总成本比例(%)","本期金额较上年同期变动比例(%)","情况说明"
"生物制药","直接材料","11118814.64","37.23","12840750.18","25.38","-13.41",""
"","直接人工","1506181.29","5.04","2408448.11","4.76","-37.46","销量减少所致"
"","制造费用","9877150.51","33.07","16580810.13","32.78","-40.43",""
"","运输费用","7362289.88","24.66","18758048.69","37.08","-60.75",""
"分产品情况","分产品情况","分产品情况","分产品情况","分产品情况","分产品情况","分产品情况","分产品情况"
"分产品","成本构成项目","本期金额","本期占总成本比例(%)","上年同期金额","上年同期占总成本比例(%)","本期金额较上年同期变动比例(%)","情况说明"
"四价流感病毒裂解疫苗","直接材料","11118814.64","37.23","12840750.18","25.38","-13.41",""
"","直接人工","1506181.29","5.04","2408448.11","4.76","-37.46","销量减少所致"
"","制造费用","9877150.51","33.07","16580810.13","32.78","-40.43",""
"","运输费用","7362289.88","24.66","18758048.69","37.08","-60.75",""

View File

@ -0,0 +1,5 @@
"2","客户二","509.71","3.79","否"
"3","客户三","318.08","2.36","否"
"4","客户四","309.50","2.30","否"
"5","客户五","256.49","1.91","否"
"合计","/","2214.00","16.45","/"

View File

@ -0,0 +1,7 @@
"序号","供应商名称","采购额","占年度采购总额比例(%","是否与上市公司存在关联关系"
"1","供应商一","1599.68","15.59","否"
"2","供应商二","1084.77","10.57","否"
"3","供应商三","941.52","9.18","否"
"4","供应商四","885.84","8.63","否"
"5","供应商五","849.64","8.28","否"
"合计","/","5361.45","52.25","/"

View File

@ -0,0 +1,5 @@
"科目","本期数","上年同期数","变动比例(%"
"销售费用","77073744.58","107494355.33","-28.30"
"管理费用","58638054.44","60622550.89","-3.27"
"财务费用","42981.30","-355527.32","不适用"
"研发费用","15471820.82","32409476.90","-52.26"

View File

@ -0,0 +1,3 @@
"科目","本期数","上年同期数","变动比例(%"
"经营活动产生的现金流量净额","80904692.08","38595320.99","109.62"
"投资活动产生的现金流量净额","-187707765.08","112695639.52","-266.56"

View File

@ -0,0 +1,13 @@
"项目名称","本期期末数","本期期末数占总资产的比例(%","上期期末数","上期期末数占总资产的比例(%","本期期末金额较上期期末变动比例(%","情况说明"
"货币资金","70443588.78","4.32","174728926.82","9.56","","-59.68说明1"
"交易性金融资产","175421746.58","10.75","390568609.77","21.38","","-55.09说明2"
"预付款项","2825253.64","0.17","5735966.10","0.31","","-50.74说明3"
"其他应收款","479099.87","0.03","542645.12","0.03","-11.71",""
"在建工程","649464436.15","39.81","619862948.00","33.93","4.78",""
"长期待摊费用","248564.85","0.02","1626952.89","0.09","","-84.72说明4"
"递延所得税资产","33313943.01","2.04","17752280.68","0.97","","87.66说明5"
"其他非流动资产","3358975.00","0.21","3888619.41","0.21","-13.62",""
"短期借款","64057597.23","3.93","42041861.11","2.30","","52.37说明6"
"应付账款","86670216.00","5.31","98922415.32","5.42","-12.39",""
"合同负债","0.00","0.00","50000.00","0.00","","-100.00说明7"
"应交税费","1046668.08","0.06","1168680.25","0.06","-10.44",""

View File

@ -0,0 +1,2 @@
"细分行业","主要治疗领域","药(产)品名称","注册分类","适应症或功能主治","是否处方药","是否属于中药保护品种(如涉及)","发明专利起止期限(如适用)","是否属于报告期内推出的新药(产)品","是否纳入国家基药目录","是否纳入国家医保目录","是否纳入省级医保目录"
"生物制药","预防流行性感冒","四价流感病毒裂解疫苗","预防用生物制品","预防流行性感冒","否","否","2020-05-05至2037-08-23","否","否","否","否"

View File

@ -0,0 +1,7 @@
"研发项目(含一致性评价项目)","药(产)品名称","注册分类","适应症或功能主治","是否处方药","是否属于中药保护品种(如涉及)","研发(注册)所处阶段"
"冻干人用狂犬病疫苗Vero细胞","冻干人用狂犬病疫苗Vero细胞","预防用生物制品3.3类","预防狂犬病","否","否","申报注册"
"四价流感病毒裂解疫苗(儿童)","四价流感病毒裂解疫苗(儿童)","预防用生物制品3.3类","预防流行性感冒","否","否","临床试验"
"23价肺炎球菌多糖疫苗/13价肺炎球菌多糖结合疫苗","23价肺炎球菌多糖疫苗/13价肺炎球菌多糖结合疫苗","预防用生物制品3.3类","预防肺炎","否","否","临床前研究"
"冻干水痘减毒活疫苗","冻干水痘减毒活疫苗","预防用生物制品3.3类","预防水痘","否","否","临床前研究"
"四价流感病毒裂解疫苗(高剂量)","四价流感病毒裂解疫苗(高剂量)","预防用生物制品3.2类","预防流行性感冒","否","否","临床前研究"
"重组带状疱疹疫苗","重组带状疱疹疫苗","预防用生物制品3.3类","预防带状疱疹","否","否","临床前研究"

View File

@ -0,0 +1,3 @@
"冻干人用狂犬病疫苗MRC-5细胞","冻干人用狂犬病疫苗MRC-5细胞","预防用生物制品3.3类","预防狂犬病","否","否","临床前研究"
"多价手足口病疫苗","多价手足口病疫苗","预防用生物制品1.4类","预防多价手足口病","否","否","临床前研究"
"注射用重组人IL12/15-PDL1单纯疱疹I型溶瘤病毒注射液","注射用重组人IL12/15-PDL1单纯疱疹I型溶瘤病毒注射液","治疗用生物制品1类","实体瘤治疗","否","否","临床前研究"

View File

@ -0,0 +1,10 @@
"同行业可比公司","研发投入金额","研发投入占营业收入比例(%","研发投入占净资产比例(%","研发投入资本化比重(%"
"长春百克生物科技股份有限公司","19874.22","10.89","4.94","5.74"
"云南沃森生物技术股份有限公司","91061.04","22.14","9.74","14.74"
"华兰生物疫苗股份有限公司","9321.33","3.87","1.49","2.26"
"康希诺生物股份公司","66167.10","185.3","12.51","3.58"
"北京万泰生物药业股份有限公司","129251.30","23.45","10.03","7.32"
"同行业平均研发投入金额","同行业平均研发投入金额","53136.88","53136.88","53136.88"
"公司报告期内研发投入占营业收入比例(%","公司报告期内研发投入占营业收入比例(%","23.38","23.38","23.38"
"公司报告期内研发投入占净资产比例(%","公司报告期内研发投入占净资产比例(%","2.29","2.29","2.29"
"公司报告期内研发投入资本化比重(%","公司报告期内研发投入资本化比重(%","50.82","50.82","50.82"

View File

@ -0,0 +1,11 @@
"研发项目","研发投入金额","研发投入费用化金额","研发投入资本化金额","研发投入占营业收入比例(%","本期金额较上年同期变动比例(%","情况说明"
"冻干人用狂犬病疫苗Vero细胞","1599.09","","1599.09","11.88","16.44",""
"四价流感病毒裂解疫苗(儿童)","410.69","410.69","","3.05","349.70","本报告期该项目已完成期临床试验正在进行III期临床试验前期准备工作研发投入同比增加。"
"23价肺炎球菌多糖疫苗/13价肺炎球菌多糖结合疫苗","123.49","123.49","","0.92","20.09",""
"冻干水痘减毒活疫苗","225.03","225.03","","1.67","-18.47",""
"四价流感病毒裂解疫苗(高剂量)","110.64","110.64","","0.82","-92.85","本报告期该项目处于临床前研究阶段,研发投入同比减少。"
"重组带状疱疹疫苗","168.99","168.99","","1.26","80.87","本报告期该项目处于临床前研究阶段,技术服务费研发投入同比增加。"
"冻干人用狂犬病疫苗MRC-5细胞","33.77","33.77","","0.25","-55.03","本报告期该项目处于临床前研究阶段,研发投入同比增加。"
"多价手足口病疫苗","33.77","33.77","","0.25","-63.65","本报告期该项目处于临床前研究阶段,研发投入同比增加。"
"注射用重组人IL12/15-PDL1单纯疱疹I型溶瘤病毒注射液","33.49","33.49","","0.25","-57.64","本报告期该项目处于临床前研究阶段,研发投入同比增加。"
"在中国3至8岁儿童中四价流感病毒裂解疫苗2针次免疫程序的探索研究","54.38","54.38","","0.40","-24.28","本报告期该项目临床研究完成,研发投入同比减少。"

View File

@ -0,0 +1,9 @@
"具体项目名称","本期发生额","本期发生额占销售费用总额比例(%"
"薪酬及社保费用","862.50","11.19"
"差旅费","66.75","0.87"
"业务招待费","35.21","0.46"
"销售服务费","6469.41","83.93"
"办公费","6.33","0.08"
"会议费","212.51","2.76"
"其他","54.66","0.71"
"合计","7707.37","100.00"

View File

@ -0,0 +1,8 @@
"同行业可比公司","销售费用","销售费用占营业收入比例(%"
"长春百克生物科技股份有限公司","64716.89","35.47"
"云南沃森生物技术股份有限公司","151957.55","36.94"
"华兰生物疫苗股份有限公司","94899.25","39.37"
"康希诺生物股份公司","35339.54","98.97"
"北京万泰生物药业股份有限公司","159509.44","28.94"
"公司报告期内销售费用总额","公司报告期内销售费用总额","7707.37"
"公司报告期内销售费用占营业收入比例(%","公司报告期内销售费用占营业收入比例(%","57.26"

View File

@ -0,0 +1,3 @@
"资产类别","期初数","本期公允价值变动损益","计入权益的累计公允价值变动","本期计提的减值","本期购买金额","本期出售/赎回金额","其他变动","期末数"
"其他","390568609.77","-146863.19","","","","215000000.00","","175421746.58"
"合计","390568609.77","-146863.19","","","","215000000.00","","175421746.58"

View File

@ -0,0 +1,3 @@
"备查文件目录","载有公司负责人、主管会计工作负责人、会计机构负责人(会计主管人员)签名并盖章的财务报表"
"","载有会计师事务所盖章、注册会计师签名并盖章的审计报告原件"
"","报告期内公开披露过的所有公司文件的正本及公告的原稿。"

View File

@ -0,0 +1,2 @@
"会议届次","召开日期","决议刊登的指定网站的查询索引","决议刊登的披露日期","会议决议"
"2022年年度股东大会","2023年5月10日","www.sse.com.cn","2023年5月11日","议案全部审议通过"

View File

@ -0,0 +1,10 @@
"姓名","职务","性别","年龄","任期起始日期","任期终止日期","","年初持股数年末持股数","年度内股份增减变动量","增减变动原因","报告期内从公司获得的税前报酬总额(万元)","是否在公司关联方获取报酬"
"余军","董事长、核心技术人员","男","55","2020-06-15","2026-05-10","27049291","37869007","10819716","资本公积金转增股本","128.87否",""
"张良斌","董事","男","49","2020-06-15","2026-05-10","27049291","37869008","10819717","资本公积金转增股本","","0是"
"聂申钱","董事","男","76","2020-06-15","2026-05-10","3381159","4733623","1352464","资本公积金转增股本","","0是"
"夏建国","董事、副总经理","男","51","2020-06-15","2026-05-10","2086865","2921611","","834746不适用","88.87否",""
"邵蓉","独立董事女","","62","2020-06-15","2026-05-10","0","0","","0不适用","","12否"
"管建强","独立董事男","","66","2020-06-15","2026-05-10","0","0","","0不适用","","12否"
"程华(辞职)","独立董事女","","45","2020-06-15","2024-01-10","0","0","","0不适用","","12否"
"魏大昌","监事会主席","男","56","2020-06-15","2026-05-10","0","0","","0不适用","62.78否",""
"余晖晟","职工监事男","","28","2020-06-15","2026-05-10","0","0","","0不适用","8.79否",""

View File

@ -0,0 +1,11 @@
"黄玲","监事","女","58","2020-06-15","2026-05-10","0","0","","0不适用","","12否"
"张建辉","总经理","男","66","2023-10-27","2026-05-10","4057394","5680352","1622958","资本公积金转增股本","30.50是",""
"任晚琼(离职)","副总经理女","","54","2020-06-15","2023-10-27","0","0","","0不适用","88.86否",""
"樊长勇","副总经理男","","45","2020-06-15","2026-05-10","0","0","","0不适用","56.77否",""
"田国雄","副总经理男","","45","2022-05-30","2026-05-10","0","0","","0不适用","118.03否",""
"滕红刚(离职)","副总经理男","","52","2022-05-30","2023-05-10","0","0","","0不适用","29.31否",""
"黄强","财务总监男","","44","2022-04-08","2026-05-10","0","0","","0不适用","63.17否",""
"李志刚(离职)","副总经理男","","42","2023-10-27","2023-12-22","0","0","","0不适用","41.93否",""
"赵巍(离职)","副总经理男","","45","2023-10-27","2023-12-22","0","0","","0不适用","18.29否",""
"吴建华","核心技术人员","男","49","2009-10-01","-","0","0","","0不适用","40.91否",""
"合计","/","/","/","/","/","","","","/","825.08","/"

View File

@ -0,0 +1,3 @@
"姓名","主要工作经历"
"余军","1992年8月至1993年10月任临川中心血站技术员1993年11月至2000年12月任博雅生物制药股份有限公司生产经理2001年1月至2002年5月任北京耀华生物技术有限公司总工程师2002年6月至2005年7月任广东佰易药业有限公司副总经理2005年10月至2014年3月任同路生物制药有限公司副总经理2014年6月至2015年5月任海南中和药业有限公司副总经理2015年6月至2023年10月担任公司核心技术人员、董事长、总经理2023年10月至今担任公司核心技术人员、董事长。"
"张良斌","1999年2月至2000年5月任博雅生物制药股份有限公司出纳2000年5月至2001年10月任广东康之选医药连锁有限公司配送中心经理2001年10月至2005年12月任广东佰易药业有限公司销售部经理2006年1月至今任同路生物制药有限公司副总经理2017"

View File

@ -0,0 +1,12 @@
"","年2月至今任浙江海康生物制品有限责任公司董事2016年8月至今担任广东上量投资有限公司监事2015年6月至今任公司董事。"
"聂申钱","1969年2月至1987年11月任中国人民解放军海军航空兵部队干部1987年12月至1993年8月任中国预防医学科学院中预公司经理1993年9月至2011年9月任中信医药实业有限公司总经理2011年10月至2013年12月任上药科园信海医药有限公司党委书记2014年7月至2016年11月任海南中和药业有限公司董事兼总经理2016年11月至今任海南中和药业股份有限公司董事、高级顾问2020年4月至今担任海南妙峰山健康产业有限公司执行董事兼总经理2015年6月至今任公司董事。"
"夏建国","1995年8月至1998年8月任南京药械厂制药机械研究所设计师1998年8月至2000年12月任博雅生物制药股份有限公司冻干技师2001年1月至2002年8月任深圳海普瑞生物技术有限公司工程部主管2002年9月至2005年12月任广东佰易药业有限公司工程部经理2006年1月至2015年5月任同路生物制药有限公司项目总监2015年6月至今担任公司董事、副总经理。"
"邵蓉","2020年6月至今担任公司独立董事。现就职于中国药科大学任国家药物政策与医药产业经济研究中心执行副主任教授、博士生导师兼任天境生物I-Mab独立董事、江苏当代国安律师事务所执业律师、中国药学会理事、中国药品监督管理研究会政策与法规专业委员会主任委员、中国药促会监事等职。"
"管建强","2020年6月至今担任公司独立董事。现担任华东政法大学教授和博士生导师兼任江苏图南合金股份有限公司独立董事。"
"程华(辞职)","2020年6月至2023年12月担任公司独立董事。现担任财政部会计准则委员会高级会计师兼任中国财政科学研究院硕士生导师、湘财股份有限公司独立董事、悦康药业集团股份有限公司独立董事、山东步长制药股份有限公司独立董事等职。"
"魏大昌","1988年11月至1993年9月任成都军区后勤部供血站精制组长1993年9月至1998年11月任江西省博达生物工程研究所工程师1998年11月至2005年3月任广东湛江双林生物制药有限公司总经理助理兼生产部部长2005年3月至2005年10月任广东佰易药业有限公司生产部经理2005年11月至2016年6月任同路生物制药有限公司生产部经理2016年6月至2018年5月任中科生物制药有限公司血制项目总监2018年6月至2019年6月任通盈生物制药有限公司血制项目总监2019年7月至今任公司包装部经理总监2020年6月至今任公司监事会主席。"
"余晖晟","2017年2月至今为公司车间员工2020年6月至今任公司职工代表监事。"
"黄玲","1988年9月至1993年10月任北京东风制药厂技术员1993年10月至1995年8月任北京亚都生物公司技术员1997年10月至2003年3月任北京巨能公司研究员2003年3月至今任北京秦脉医药咨询有限责任公司咨询师2020年6月至今任公司监事。"
"张建辉","1975年1月至1997年12月在江西省抚州地区煤炭公司任职1998年1月至2007年1月任江西省崇仁县单采血浆站站长2007年2月至2009年1月任博雅生物制药股份有限公司副总经理2009年2月至2011年12月任同路生物制药有限公司血浆部副总经理2012年1月至今任郴州市云鼎房地产有限公司董事长2020年5月至今任福建省宏冠房地产开发有限公司董事长2023年10月至今担任公司总经理。"
"任晚琼(离职)","1993年8月至2010年7月任职于河南欣泰药业有限公司历任质检科职员、质检科主任、质量保证部部长、副总经理2010年8月至2015年2月任河南远大生物制药有限公司副总经理2015年6月至2019年6月任公司质量总监2019年6月至2023年10月任公司副总经理。"
"樊长勇","2001年7月至2004年1月任上海九鼎粉体材料有限公司技术员2004年1月至2007年7月任上海界龙实业股份有限公司高级经理2007年7月至2009年8月任国信证券股份有限公司投资银行高级经理2009年9月至2015年6月任中信证券股份有限公司投资银行"

View File

@ -0,0 +1,7 @@
"","委员会副总裁VP、高级副总裁SVP、保荐代表人2016年4月至2020年4月任上海莱士血液制品股份有限公司董事长助理2018年9月至2020年4月任同方莱士医药产业投资广东有限公司总经理2020年5月至2024年1月任公司副总经理兼董事会秘书。"
"田国雄","2002年7月至2005年12月先后任广东佰易药业有限公司地区商务经理、地区销售经理2006年1月至2022年3月先后任同路生物制药有限公司地区销售经理、大区销售经理、大区销售总监。2022年5月起任公司副总经理。"
"滕红刚(离职)","1995年9月至2000年7月在长春生物制品所病毒研究室工作2003年7月在长春生物制品所获得免疫学硕士学位2006年7月在吉林大学生命科学学院获得生物化学与分子生物学专业博士学位2006年10月至2007年4月任中国科学院广州生物医药与健康研究院研究助理2007年6月至2009年6月任吉林亚泰生物药业股份有限公司副总经理2009年6月至2011年10月任鸿达生物药业长春股份有限公司副总经理2011年12月至2015年5月任长春卫尔赛生物药业有限公司生产总监2015年8月至2016年5月任霍普金斯医药研究院长春分院院长2016年8月至2022年3月先后任辽宁依生生物制药有限公司副总经理、总经理。2022年5月至2023年5月任公司副总经理。"
"黄强","2002年7月至2016年3月历任河南神火煤电股份有限公司000933.SZ财务部科员、副科长、科长2016年4月至2021年3月历任海南中和药业股份有限公司证券事务代表、董事会办公室主任、财务副总监、总经理助理2021年4月至2022年4月任江苏金迪克生物技术股份有限公司财务副总监。2022年4月起任公司财务总监。"
"李志刚(离职)","2008年3月至2010年3月任牛津大学高级研究助理2010年8月至2012年10月任北京必威安泰生物科技有限公司研发项目负责人2012年10月至2018年3月任北京生物制品研究所有限公司经理、副主任等职2018年4月至2019年7月任中国生物技术股份有限公司部长助理2019年8月至2021年5月任北京民海生物科技有限公司质量合规总监2021年5月至2022年4月任斯微上海生物科技有限公司副总裁2022年5月至2023年10月任君拓生物医药科技海南有限公司副总裁。2023年10月至2023年12月担任公司副总经理。"
"赵巍(离职)","2000年10月至2002年9月任武汉海特生物制药股份有限公司员工2002年9月至2004年6月就读于华中科技大学同济医学院获学士学位2005年9月至2007年6月就读于武汉大学获硕士学位2007年7月至2023年3月历任武汉生物制品研究所有限责任公司流感疫苗课题组第二课题负责人、病毒性疫苗研究二室主任、流感病毒疫苗室主任。2023年3月至2023年10月任上海君拓生物医药科技有限公司总裁助理兼无锡君和生物医药科技有限公司副总经理。2023年10月至2023年12月担任公司副总经理。"
"吴建华","1998年6月至2003年3月任浙江天元生物药业股份有限公司生产技术员、研发助理工程师2003年3月至2009年9月任北京金迪克生物技术研究所研发主管2009年10月至今任公司质量控制部经理。吴建华主要负责公司四价流感病毒裂解疫苗的临床前和临床试验研究、生产工艺研究和质量控制研究工作以及公司冻干人用狂犬病疫苗Vero细胞、四价流感病毒裂解疫苗儿童、四价流感病毒裂解疫苗高剂量、冻干水痘减毒活疫苗、冻干带状疱疹减毒活疫苗等在研项目的质量控制研究工作。"

View File

@ -0,0 +1,4 @@
"任职人员姓名","股东单位名称","在股东单位担任的职务","","任期起始日期任期终止日期"
"余军","泰州同泽","执行事务合伙人","2020年5月",""
"张良斌","泰州同人","执行事务合伙人","2020年5月",""
"在股东单位任职情况的说明","不适用","不适用","不适用","不适用"

Some files were not shown because too many files have changed in this diff.