Add Word version code

parent 15e33eadf7
commit dd5ee3722e
@@ -1,7 +1,5 @@
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, MilvusClient
from config import MILVUS_CLIENT
import time
from datetime import datetime, timedelta

def create_partition_by_hour(current_hour):
    # Connect to the Milvus server
@@ -28,6 +26,7 @@ def create_partition_by_hour(current_hour):
            pre_partition.release()
            collection.drop_partition(name)
            print(f"Partition '{name}' deleted.")
    connections.disconnect("default")
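The two hunks above touch only fragments of the hourly partition rotation: creating a partition for the current hour and releasing/dropping expired ones. For context, here is a minimal sketch of what such a function can look like with the pymilvus ORM API; the server address, collection name, retention window, and "p%Y%m%d%H" naming scheme are assumptions for illustration, not values taken from this commit.

from datetime import datetime, timedelta
from pymilvus import connections, Collection

def create_partition_by_hour(current_hour: datetime,
                             collection_name: str = "logs",  # assumed name
                             keep_hours: int = 24):          # assumed retention window
    """Create this hour's partition and drop partitions older than keep_hours."""
    connections.connect("default", host="localhost", port="19530")  # assumed address
    collection = Collection(collection_name)

    # Ensure a partition exists for the current hour, e.g. "p2024101912".
    part_name = current_hour.strftime("p%Y%m%d%H")
    if not collection.has_partition(part_name):
        collection.create_partition(part_name)

    # Drop partitions that have aged out of the retention window.
    cutoff = current_hour - timedelta(hours=keep_hours)
    for partition in collection.partitions:
        try:
            ts = datetime.strptime(partition.name, "p%Y%m%d%H")
        except ValueError:
            continue  # skip _default and any non-hourly partitions
        if ts < cutoff:
            partition.release()  # a loaded partition must be released before dropping
            collection.drop_partition(partition.name)
            print(f"Partition '{partition.name}' deleted.")

    connections.disconnect("default")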
@@ -0,0 +1,3 @@
*.pyc
*.vscode
__pycache__/
@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Encoding">
    <file url="file://$PROJECT_DIR$/log-day/sec.log" charset="GBK" />
  </component>
</project>
@@ -0,0 +1,168 @@
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
      <option name="ignoredPackages">
        <value>
          <list size="155">
            <item index="0" class="java.lang.String" itemvalue="pandas" />
            <item index="1" class="java.lang.String" itemvalue="protobuf" />
            <item index="2" class="java.lang.String" itemvalue="decorator" />
            <item index="3" class="java.lang.String" itemvalue="TA-Lib" />
            <item index="4" class="java.lang.String" itemvalue="websocket-client" />
            <item index="5" class="java.lang.String" itemvalue="altgraph" />
            <item index="6" class="java.lang.String" itemvalue="tzlocal" />
            <item index="7" class="java.lang.String" itemvalue="Babel" />
            <item index="8" class="java.lang.String" itemvalue="testpath" />
            <item index="9" class="java.lang.String" itemvalue="pickleshare" />
            <item index="10" class="java.lang.String" itemvalue="psycopg2" />
            <item index="11" class="java.lang.String" itemvalue="defusedxml" />
            <item index="12" class="java.lang.String" itemvalue="lml" />
            <item index="13" class="java.lang.String" itemvalue="PyQt5-sip" />
            <item index="14" class="java.lang.String" itemvalue="javascripthon" />
            <item index="15" class="java.lang.String" itemvalue="ipython-genutils" />
            <item index="16" class="java.lang.String" itemvalue="tables" />
            <item index="17" class="java.lang.String" itemvalue="rqdatac" />
            <item index="18" class="java.lang.String" itemvalue="Pygments" />
            <item index="19" class="java.lang.String" itemvalue="PyQt5" />
            <item index="20" class="java.lang.String" itemvalue="bleach" />
            <item index="21" class="java.lang.String" itemvalue="graphviz" />
            <item index="22" class="java.lang.String" itemvalue="jsonschema" />
            <item index="23" class="java.lang.String" itemvalue="pywin32" />
            <item index="24" class="java.lang.String" itemvalue="qtconsole" />
            <item index="25" class="java.lang.String" itemvalue="terminado" />
            <item index="26" class="java.lang.String" itemvalue="portalocker" />
            <item index="27" class="java.lang.String" itemvalue="Werkzeug" />
            <item index="28" class="java.lang.String" itemvalue="aniso8601" />
            <item index="29" class="java.lang.String" itemvalue="mxnet" />
            <item index="30" class="java.lang.String" itemvalue="jupyter-client" />
            <item index="31" class="java.lang.String" itemvalue="QDarkStyle" />
            <item index="32" class="java.lang.String" itemvalue="ipykernel" />
            <item index="33" class="java.lang.String" itemvalue="nbconvert" />
            <item index="34" class="java.lang.String" itemvalue="attrs" />
            <item index="35" class="java.lang.String" itemvalue="pefile" />
            <item index="36" class="java.lang.String" itemvalue="psutil" />
            <item index="37" class="java.lang.String" itemvalue="pyinstaller-hooks-contrib" />
            <item index="38" class="java.lang.String" itemvalue="PyQtWebEngine" />
            <item index="39" class="java.lang.String" itemvalue="simplejson" />
            <item index="40" class="java.lang.String" itemvalue="prettytable" />
            <item index="41" class="java.lang.String" itemvalue="jedi" />
            <item index="42" class="java.lang.String" itemvalue="helpdev" />
            <item index="43" class="java.lang.String" itemvalue="pyqtgraph" />
            <item index="44" class="java.lang.String" itemvalue="dukpy" />
            <item index="45" class="java.lang.String" itemvalue="futu-api" />
            <item index="46" class="java.lang.String" itemvalue="matplotlib" />
            <item index="47" class="java.lang.String" itemvalue="humanize" />
            <item index="48" class="java.lang.String" itemvalue="PyMySQL" />
            <item index="49" class="java.lang.String" itemvalue="msgpack" />
            <item index="50" class="java.lang.String" itemvalue="idna" />
            <item index="51" class="java.lang.String" itemvalue="rsa" />
            <item index="52" class="java.lang.String" itemvalue="vnstation" />
            <item index="53" class="java.lang.String" itemvalue="pandocfilters" />
            <item index="54" class="java.lang.String" itemvalue="numpy" />
            <item index="55" class="java.lang.String" itemvalue="pyasn1" />
            <item index="56" class="java.lang.String" itemvalue="requests" />
            <item index="57" class="java.lang.String" itemvalue="pyrsistent" />
            <item index="58" class="java.lang.String" itemvalue="gluoncv" />
            <item index="59" class="java.lang.String" itemvalue="jdcal" />
            <item index="60" class="java.lang.String" itemvalue="jupyter" />
            <item index="61" class="java.lang.String" itemvalue="seaborn" />
            <item index="62" class="java.lang.String" itemvalue="zipp" />
            <item index="63" class="java.lang.String" itemvalue="prompt-toolkit" />
            <item index="64" class="java.lang.String" itemvalue="tigeropen" />
            <item index="65" class="java.lang.String" itemvalue="itsdangerous" />
            <item index="66" class="java.lang.String" itemvalue="pyee" />
            <item index="67" class="java.lang.String" itemvalue="deap" />
            <item index="68" class="java.lang.String" itemvalue="websockets" />
            <item index="69" class="java.lang.String" itemvalue="ipywidgets" />
            <item index="70" class="java.lang.String" itemvalue="scipy" />
            <item index="71" class="java.lang.String" itemvalue="tornado" />
            <item index="72" class="java.lang.String" itemvalue="pyppeteer" />
            <item index="73" class="java.lang.String" itemvalue="Send2Trash" />
            <item index="74" class="java.lang.String" itemvalue="et-xmlfile" />
            <item index="75" class="java.lang.String" itemvalue="incremental" />
            <item index="76" class="java.lang.String" itemvalue="mistune" />
            <item index="77" class="java.lang.String" itemvalue="cnocr" />
            <item index="78" class="java.lang.String" itemvalue="future" />
            <item index="79" class="java.lang.String" itemvalue="mpmath" />
            <item index="80" class="java.lang.String" itemvalue="jupyter-console" />
            <item index="81" class="java.lang.String" itemvalue="macropy3" />
            <item index="82" class="java.lang.String" itemvalue="pycryptodome" />
            <item index="83" class="java.lang.String" itemvalue="pytz" />
            <item index="84" class="java.lang.String" itemvalue="setproctitle" />
            <item index="85" class="java.lang.String" itemvalue="webencodings" />
            <item index="86" class="java.lang.String" itemvalue="Pillow" />
            <item index="87" class="java.lang.String" itemvalue="Twisted" />
            <item index="88" class="java.lang.String" itemvalue="traitlets" />
            <item index="89" class="java.lang.String" itemvalue="Automat" />
            <item index="90" class="java.lang.String" itemvalue="pywinpty" />
            <item index="91" class="java.lang.String" itemvalue="python-dateutil" />
            <item index="92" class="java.lang.String" itemvalue="Brotli" />
            <item index="93" class="java.lang.String" itemvalue="Click" />
            <item index="94" class="java.lang.String" itemvalue="cycler" />
            <item index="95" class="java.lang.String" itemvalue="MarkupSafe" />
            <item index="96" class="java.lang.String" itemvalue="twisted-iocpsupport" />
            <item index="97" class="java.lang.String" itemvalue="constantly" />
            <item index="98" class="java.lang.String" itemvalue="mongoengine" />
            <item index="99" class="java.lang.String" itemvalue="appdirs" />
            <item index="100" class="java.lang.String" itemvalue="docopt" />
            <item index="101" class="java.lang.String" itemvalue="ibapi" />
            <item index="102" class="java.lang.String" itemvalue="pymssql" />
            <item index="103" class="java.lang.String" itemvalue="pyzmq" />
            <item index="104" class="java.lang.String" itemvalue="certifi" />
            <item index="105" class="java.lang.String" itemvalue="entrypoints" />
            <item index="106" class="java.lang.String" itemvalue="peewee" />
            <item index="107" class="java.lang.String" itemvalue="pyparsing" />
            <item index="108" class="java.lang.String" itemvalue="sympy" />
            <item index="109" class="java.lang.String" itemvalue="notebook" />
            <item index="110" class="java.lang.String" itemvalue="hyperlink" />
            <item index="111" class="java.lang.String" itemvalue="win-unicode-console" />
            <item index="112" class="java.lang.String" itemvalue="kiwisolver" />
            <item index="113" class="java.lang.String" itemvalue="zope.interface" />
            <item index="114" class="java.lang.String" itemvalue="APScheduler" />
            <item index="115" class="java.lang.String" itemvalue="backcall" />
            <item index="116" class="java.lang.String" itemvalue="PySocks" />
            <item index="117" class="java.lang.String" itemvalue="widgetsnbextension" />
            <item index="118" class="java.lang.String" itemvalue="numexpr" />
            <item index="119" class="java.lang.String" itemvalue="pyecharts-snapshot" />
            <item index="120" class="java.lang.String" itemvalue="jupyter-core" />
            <item index="121" class="java.lang.String" itemvalue="pyecharts-jupyter-installer" />
            <item index="122" class="java.lang.String" itemvalue="Delorean" />
            <item index="123" class="java.lang.String" itemvalue="SQLAlchemy" />
            <item index="124" class="java.lang.String" itemvalue="wcwidth" />
            <item index="125" class="java.lang.String" itemvalue="importlib-metadata" />
            <item index="126" class="java.lang.String" itemvalue="Jinja2" />
            <item index="127" class="java.lang.String" itemvalue="simplegeneric" />
            <item index="128" class="java.lang.String" itemvalue="stomp.py" />
            <item index="129" class="java.lang.String" itemvalue="pywin32-ctypes" />
            <item index="130" class="java.lang.String" itemvalue="pyecharts" />
            <item index="131" class="java.lang.String" itemvalue="urllib3" />
            <item index="132" class="java.lang.String" itemvalue="Flask" />
            <item index="133" class="java.lang.String" itemvalue="coverage" />
            <item index="134" class="java.lang.String" itemvalue="pyinstaller" />
            <item index="135" class="java.lang.String" itemvalue="pymongo" />
            <item index="136" class="java.lang.String" itemvalue="six" />
            <item index="137" class="java.lang.String" itemvalue="parso" />
            <item index="138" class="java.lang.String" itemvalue="pytesseract" />
            <item index="139" class="java.lang.String" itemvalue="nbformat" />
            <item index="140" class="java.lang.String" itemvalue="ipython" />
            <item index="141" class="java.lang.String" itemvalue="jqdatasdk" />
            <item index="142" class="java.lang.String" itemvalue="python-rapidjson" />
            <item index="143" class="java.lang.String" itemvalue="packaging" />
            <item index="144" class="java.lang.String" itemvalue="pyecharts-javascripthon" />
            <item index="145" class="java.lang.String" itemvalue="prometheus-client" />
            <item index="146" class="java.lang.String" itemvalue="jupyter-echarts-pypkg" />
            <item index="147" class="java.lang.String" itemvalue="chardet" />
            <item index="148" class="java.lang.String" itemvalue="tqdm" />
            <item index="149" class="java.lang.String" itemvalue="thriftpy2" />
            <item index="150" class="java.lang.String" itemvalue="colorama" />
            <item index="151" class="java.lang.String" itemvalue="vnpy" />
            <item index="152" class="java.lang.String" itemvalue="ply" />
            <item index="153" class="java.lang.String" itemvalue="Flask-RESTful" />
            <item index="154" class="java.lang.String" itemvalue="openpyxl" />
          </list>
        </value>
      </option>
    </inspection_tool>
  </profile>
</component>
@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Black">
    <option name="sdkName" value="Python 3.11" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
</project>
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/zzb_data.iml" filepath="$PROJECT_DIR$/.idea/zzb_data.iml" />
    </modules>
  </component>
</project>
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="" vcs="Git" />
  </component>
</project>
@@ -0,0 +1,281 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="AutoImportSettings">
    <option name="autoReloadType" value="SELECTIVE" />
  </component>
  <component name="ChangeListManager">
    <list default="true" id="22ddc8e3-82b9-4724-8dc4-c1cf50311f22" name="Changes" comment="">
      <change afterPath="$PROJECT_DIR$/000593.docx" afterDir="false" />
      <change afterPath="$PROJECT_DIR$/000593.pdf" afterDir="false" />
      <change afterPath="$PROJECT_DIR$/app_word.py" afterDir="false" />
      <change afterPath="$PROJECT_DIR$/db_service_word.py" afterDir="false" />
      <change afterPath="$PROJECT_DIR$/file/docx/通威股份有限公司2023年第三季度报告.docx" afterDir="false" />
      <change afterPath="$PROJECT_DIR$/main_word.py" afterDir="false" />
      <change afterPath="$PROJECT_DIR$/test.py" afterDir="false" />
      <change afterPath="$PROJECT_DIR$/word_title.py" afterDir="false" />
      <change afterPath="$PROJECT_DIR$/zzb_logger.py" afterDir="false" />
      <change afterPath="$PROJECT_DIR$/通威股份有限公司2023年第三季度报告.docx" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/.vscode/launch.json" beforeDir="false" />
      <change beforePath="$PROJECT_DIR$/app.py" beforeDir="false" afterPath="$PROJECT_DIR$/app.py" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/config.py" beforeDir="false" afterPath="$PROJECT_DIR$/config.py" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/config_init.py" beforeDir="false" afterPath="$PROJECT_DIR$/config_init.py" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/db_service.py" beforeDir="false" afterPath="$PROJECT_DIR$/db_service.py" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/file/docx/000593-2023-nb-nb.docx" beforeDir="false" afterPath="$PROJECT_DIR$/file/docx/000593-2023-nb-nb.docx" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/file/docx/test.txt" beforeDir="false" />
      <change beforePath="$PROJECT_DIR$/main.py" beforeDir="false" afterPath="$PROJECT_DIR$/main.py" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/parse_word/parse_word.py" beforeDir="false" afterPath="$PROJECT_DIR$/parse_word/parse_word.py" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/pdf_title.py" beforeDir="false" afterPath="$PROJECT_DIR$/pdf_title.py" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/redis_service.py" beforeDir="false" afterPath="$PROJECT_DIR$/redis_service.py" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/requirements.txt" beforeDir="false" afterPath="$PROJECT_DIR$/requirements.txt" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/test_process.py" beforeDir="false" afterPath="$PROJECT_DIR$/test_process.py" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/utils.py" afterDir="false" />
    </list>
    <option name="SHOW_DIALOG" value="false" />
    <option name="HIGHLIGHT_CONFLICTS" value="true" />
    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
    <option name="LAST_RESOLUTION" value="IGNORE" />
  </component>
  <component name="FileTemplateManagerImpl">
    <option name="RECENT_TEMPLATES">
      <list>
        <option value="Python Script" />
      </list>
    </option>
  </component>
  <component name="FlaskConsoleOptions" custom-start-script="import sys sys.path.extend([WORKING_DIR_AND_PYTHON_PATHS]) from flask.cli import ScriptInfo locals().update(ScriptInfo(create_app=None).load_app().make_shell_context()) print(&quot;Python %s on %s\nApp: %s [%s]\nInstance: %s&quot; % (sys.version, sys.platform, app.import_name, app.env, app.instance_path))">
    <envs>
      <env key="FLASK_APP" value="app" />
    </envs>
    <option name="myCustomStartScript" value="import sys sys.path.extend([WORKING_DIR_AND_PYTHON_PATHS]) from flask.cli import ScriptInfo locals().update(ScriptInfo(create_app=None).load_app().make_shell_context()) print(&quot;Python %s on %s\nApp: %s [%s]\nInstance: %s&quot; % (sys.version, sys.platform, app.import_name, app.env, app.instance_path))" />
    <option name="myEnvs">
      <map>
        <entry key="FLASK_APP" value="app" />
      </map>
    </option>
  </component>
  <component name="Git.Settings">
    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
  </component>
  <component name="ProjectColorInfo">{
  "associatedIndex": 7
}</component>
  <component name="ProjectId" id="2mTMc5iMC8X5mnsBHls6IKXwyDz" />
  <component name="ProjectViewState">
    <option name="hideEmptyMiddlePackages" value="true" />
    <option name="showLibraryContents" value="true" />
  </component>
  <component name="PropertiesComponent">{
  "keyToString": {
    "Python.app.executor": "Run",
    "Python.app_word.executor": "Run",
    "Python.config_init.executor": "Run",
    "Python.db_service.executor": "Debug",
    "Python.db_service_word.executor": "Debug",
    "Python.main_word.executor": "Debug",
    "Python.parse_word.executor": "Run",
    "Python.pdf_title.executor": "Run",
    "Python.redis_service.executor": "Run",
    "Python.test.executor": "Run",
    "Python.test_process.executor": "Run",
    "Python.zzb_logger.executor": "Run",
    "RunOnceActivity.OpenProjectViewOnStart": "true",
    "RunOnceActivity.ShowReadmeOnStart": "true",
    "git-widget-placeholder": "pdf-0914(测试环境)",
    "ignore.virus.scanning.warn.message": "true",
    "last_opened_file_path": "C:/Users/45272/OneDrive/Documents/work/code/zzb_data_word/parse_word",
    "node.js.detected.package.eslint": "true",
    "node.js.detected.package.tslint": "true",
    "node.js.selected.package.eslint": "(autodetect)",
    "node.js.selected.package.tslint": "(autodetect)",
    "nodejs_package_manager_path": "npm",
    "settings.editor.selected.configurable": "com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable",
    "vue.rearranger.settings.migration": "true"
  }
}</component>
  <component name="RecentsManager">
    <key name="CopyFile.RECENT_KEYS">
      <recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data_word\parse_word" />
      <recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data\parse_word" />
      <recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data\file\docx" />
      <recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data" />
    </key>
  </component>
  <component name="RunManager" selected="Python.app_word">
    <configuration name="app_word" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
      <module name="zzb_data" />
      <option name="ENV_FILES" value="" />
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs>
        <env name="PYTHONUNBUFFERED" value="1" />
      </envs>
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
      <option name="IS_MODULE_SDK" value="true" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/app_word.py" />
      <option name="PARAMETERS" value="" />
      <option name="SHOW_COMMAND_LINE" value="false" />
      <option name="EMULATE_TERMINAL" value="false" />
      <option name="MODULE_MODE" value="false" />
      <option name="REDIRECT_INPUT" value="false" />
      <option name="INPUT_FILE" value="" />
      <method v="2" />
    </configuration>
    <configuration name="main_word" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
      <module name="zzb_data" />
      <option name="ENV_FILES" value="" />
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs>
        <env name="PYTHONUNBUFFERED" value="1" />
      </envs>
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
      <option name="IS_MODULE_SDK" value="true" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/main_word.py" />
      <option name="PARAMETERS" value="" />
      <option name="SHOW_COMMAND_LINE" value="false" />
      <option name="EMULATE_TERMINAL" value="false" />
      <option name="MODULE_MODE" value="false" />
      <option name="REDIRECT_INPUT" value="false" />
      <option name="INPUT_FILE" value="" />
      <method v="2" />
    </configuration>
    <configuration name="parse_word" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
      <module name="zzb_data" />
      <option name="ENV_FILES" value="" />
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs>
        <env name="PYTHONUNBUFFERED" value="1" />
      </envs>
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/parse_word" />
      <option name="IS_MODULE_SDK" value="true" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/parse_word/parse_word.py" />
      <option name="PARAMETERS" value="" />
      <option name="SHOW_COMMAND_LINE" value="false" />
      <option name="EMULATE_TERMINAL" value="false" />
      <option name="MODULE_MODE" value="false" />
      <option name="REDIRECT_INPUT" value="false" />
      <option name="INPUT_FILE" value="" />
      <method v="2" />
    </configuration>
    <configuration name="test" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
      <module name="zzb_data" />
      <option name="ENV_FILES" value="" />
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs>
        <env name="PYTHONUNBUFFERED" value="1" />
      </envs>
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
      <option name="IS_MODULE_SDK" value="true" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/test.py" />
      <option name="PARAMETERS" value="" />
      <option name="SHOW_COMMAND_LINE" value="false" />
      <option name="EMULATE_TERMINAL" value="false" />
      <option name="MODULE_MODE" value="false" />
      <option name="REDIRECT_INPUT" value="false" />
      <option name="INPUT_FILE" value="" />
      <method v="2" />
    </configuration>
    <configuration name="zzb_logger" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
      <module name="zzb_data" />
      <option name="ENV_FILES" value="" />
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs>
        <env name="PYTHONUNBUFFERED" value="1" />
      </envs>
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
      <option name="IS_MODULE_SDK" value="true" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/zzb_logger.py" />
      <option name="PARAMETERS" value="" />
      <option name="SHOW_COMMAND_LINE" value="false" />
      <option name="EMULATE_TERMINAL" value="false" />
      <option name="MODULE_MODE" value="false" />
      <option name="REDIRECT_INPUT" value="false" />
      <option name="INPUT_FILE" value="" />
      <method v="2" />
    </configuration>
    <recent_temporary>
      <list>
        <item itemvalue="Python.app_word" />
        <item itemvalue="Python.zzb_logger" />
        <item itemvalue="Python.test" />
        <item itemvalue="Python.parse_word" />
        <item itemvalue="Python.main_word" />
      </list>
    </recent_temporary>
  </component>
  <component name="SharedIndexes">
    <attachedChunks>
      <set>
        <option value="bundled-python-sdk-5a2391486177-2887949eec09-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-233.13763.11" />
      </set>
    </attachedChunks>
  </component>
  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
  <component name="TaskManager">
    <task active="true" id="Default" summary="Default task">
      <changelist id="22ddc8e3-82b9-4724-8dc4-c1cf50311f22" name="Changes" comment="" />
      <created>1727096188853</created>
      <option name="number" value="Default" />
      <option name="presentableId" value="Default" />
      <updated>1727096188853</updated>
      <workItem from="1727096189919" duration="4242000" />
      <workItem from="1727226275743" duration="13996000" />
      <workItem from="1727406650674" duration="9011000" />
      <workItem from="1727574308954" duration="28121000" />
      <workItem from="1728458648396" duration="3371000" />
      <workItem from="1728462140132" duration="473000" />
      <workItem from="1728462643998" duration="20841000" />
      <workItem from="1728544515382" duration="4091000" />
      <workItem from="1728557155319" duration="4071000" />
      <workItem from="1728607689751" duration="6705000" />
      <workItem from="1728868463278" duration="598000" />
      <workItem from="1728953453192" duration="2839000" />
      <workItem from="1728958252539" duration="12021000" />
      <workItem from="1729042469650" duration="17683000" />
      <workItem from="1729213219892" duration="15109000" />
    </task>
    <servers />
  </component>
  <component name="TypeScriptGeneratedFilesManager">
    <option name="version" value="3" />
  </component>
  <component name="com.intellij.coverage.CoverageDataManagerImpl">
    <SUITE FILE_PATH="coverage/zzb_data$app.coverage" NAME="app Coverage Results" MODIFIED="1727226379705" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data$main_word.coverage" NAME="main_word Coverage Results" MODIFIED="1728366719918" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data$redis_service.coverage" NAME="redis_service Coverage Results" MODIFIED="1728537921801" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data_word$test.coverage" NAME="test Coverage Results" MODIFIED="1729216810415" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data$config_init.coverage" NAME="config_init Coverage Results" MODIFIED="1728540429755" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data$test_process.coverage" NAME="test_process Coverage Results" MODIFIED="1728545660471" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data$pdf_title.coverage" NAME="pdf_title Coverage Results" MODIFIED="1727243043393" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data$db_service_word.coverage" NAME="db_service_word Coverage Results" MODIFIED="1727619004690" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data_word$zzb_logger.coverage" NAME="zzb_logger Coverage Results" MODIFIED="1729237015669" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data$db_service.coverage" NAME="db_service Coverage Results" MODIFIED="1727572268056" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data$parse_word.coverage" NAME="parse_word Coverage Results" MODIFIED="1728569829164" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/parse_word" />
    <SUITE FILE_PATH="coverage/zzb_data$app_word.coverage" NAME="app_word Coverage Results" MODIFIED="1728569456711" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data_word$app_word.coverage" NAME="app_word Coverage Results" MODIFIED="1729238946419" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data_word$parse_word.coverage" NAME="parse_word Coverage Results" MODIFIED="1729064030098" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/parse_word" />
  </component>
</project>
@@ -0,0 +1,298 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="AutoImportSettings">
    <option name="autoReloadType" value="SELECTIVE" />
  </component>
  <component name="ChangeListManager">
    <list default="true" id="22ddc8e3-82b9-4724-8dc4-c1cf50311f22" name="Changes" comment="">
      <change afterPath="$PROJECT_DIR$/000593.docx" afterDir="false" />
      <change afterPath="$PROJECT_DIR$/000593.pdf" afterDir="false" />
      <change afterPath="$PROJECT_DIR$/101.docx" afterDir="false" />
      <change afterPath="$PROJECT_DIR$/app_word.py" afterDir="false" />
      <change afterPath="$PROJECT_DIR$/db_service_word.py" afterDir="false" />
      <change afterPath="$PROJECT_DIR$/file/docx/西部建设.docx" afterDir="false" />
      <change afterPath="$PROJECT_DIR$/file/docx/通威股份有限公司2023年第三季度报告.docx" afterDir="false" />
      <change afterPath="$PROJECT_DIR$/main_word.py" afterDir="false" />
      <change afterPath="$PROJECT_DIR$/test.py" afterDir="false" />
      <change afterPath="$PROJECT_DIR$/word_title.py" afterDir="false" />
      <change afterPath="$PROJECT_DIR$/zzb_logger.py" afterDir="false" />
      <change afterPath="$PROJECT_DIR$/西部建设.docx" afterDir="false" />
      <change afterPath="$PROJECT_DIR$/通威股份有限公司2023年第三季度报告.docx" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/.vscode/launch.json" beforeDir="false" />
      <change beforePath="$PROJECT_DIR$/app.py" beforeDir="false" afterPath="$PROJECT_DIR$/app.py" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/config.py" beforeDir="false" afterPath="$PROJECT_DIR$/config.py" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/config_init.py" beforeDir="false" afterPath="$PROJECT_DIR$/config_init.py" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/db_service.py" beforeDir="false" afterPath="$PROJECT_DIR$/db_service.py" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/file/docx/000593-2023-nb-nb.docx" beforeDir="false" afterPath="$PROJECT_DIR$/file/docx/000593-2023-nb-nb.docx" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/file/docx/test.txt" beforeDir="false" />
      <change beforePath="$PROJECT_DIR$/main.py" beforeDir="false" afterPath="$PROJECT_DIR$/main.py" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/parse_word/parse_word.py" beforeDir="false" afterPath="$PROJECT_DIR$/parse_word/parse_word.py" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/pdf_title.py" beforeDir="false" afterPath="$PROJECT_DIR$/pdf_title.py" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/redis_service.py" beforeDir="false" afterPath="$PROJECT_DIR$/redis_service.py" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/requirements.txt" beforeDir="false" afterPath="$PROJECT_DIR$/requirements.txt" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/test_process.py" beforeDir="false" afterPath="$PROJECT_DIR$/test_process.py" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/utils.py" beforeDir="false" afterPath="$PROJECT_DIR$/utils.py" afterDir="false" />
    </list>
    <option name="SHOW_DIALOG" value="false" />
    <option name="HIGHLIGHT_CONFLICTS" value="true" />
    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
    <option name="LAST_RESOLUTION" value="IGNORE" />
  </component>
  <component name="FileTemplateManagerImpl">
    <option name="RECENT_TEMPLATES">
      <list>
        <option value="Python Script" />
      </list>
    </option>
  </component>
  <component name="FlaskConsoleOptions" custom-start-script="import sys sys.path.extend([WORKING_DIR_AND_PYTHON_PATHS]) from flask.cli import ScriptInfo locals().update(ScriptInfo(create_app=None).load_app().make_shell_context()) print(&quot;Python %s on %s\nApp: %s [%s]\nInstance: %s&quot; % (sys.version, sys.platform, app.import_name, app.env, app.instance_path))">
    <envs>
      <env key="FLASK_APP" value="app" />
    </envs>
    <option name="myCustomStartScript" value="import sys sys.path.extend([WORKING_DIR_AND_PYTHON_PATHS]) from flask.cli import ScriptInfo locals().update(ScriptInfo(create_app=None).load_app().make_shell_context()) print(&quot;Python %s on %s\nApp: %s [%s]\nInstance: %s&quot; % (sys.version, sys.platform, app.import_name, app.env, app.instance_path))" />
    <option name="myEnvs">
      <map>
        <entry key="FLASK_APP" value="app" />
      </map>
    </option>
  </component>
  <component name="Git.Settings">
    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
  </component>
  <component name="ProjectColorInfo">{
  "associatedIndex": 7
}</component>
  <component name="ProjectId" id="2mTMc5iMC8X5mnsBHls6IKXwyDz" />
  <component name="ProjectViewState">
    <option name="hideEmptyMiddlePackages" value="true" />
    <option name="showLibraryContents" value="true" />
  </component>
  <component name="PropertiesComponent">{
  "keyToString": {
    "Python.app.executor": "Run",
    "Python.app_word.executor": "Run",
    "Python.config_init.executor": "Run",
    "Python.db_service.executor": "Debug",
    "Python.db_service_word.executor": "Debug",
    "Python.main_word.executor": "Debug",
    "Python.parse_word.executor": "Run",
    "Python.pdf_title.executor": "Run",
    "Python.redis_service.executor": "Run",
    "Python.test.executor": "Run",
    "Python.test_process.executor": "Run",
    "RunOnceActivity.OpenProjectViewOnStart": "true",
    "RunOnceActivity.ShowReadmeOnStart": "true",
    "git-widget-placeholder": "pdf-0914(测试环境)",
    "ignore.virus.scanning.warn.message": "true",
    "last_opened_file_path": "C:/Users/45272/OneDrive/Documents/work/code/mars_2.0.1 - income",
    "node.js.detected.package.eslint": "true",
    "node.js.detected.package.tslint": "true",
    "node.js.selected.package.eslint": "(autodetect)",
    "node.js.selected.package.tslint": "(autodetect)",
    "nodejs_package_manager_path": "npm",
    "settings.editor.selected.configurable": "com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable",
    "vue.rearranger.settings.migration": "true"
  }
}</component>
  <component name="RecentsManager">
    <key name="CopyFile.RECENT_KEYS">
      <recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data_word" />
      <recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data_word\file\docx" />
      <recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data_word\parse_word" />
      <recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data\parse_word" />
      <recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data\file\docx" />
    </key>
    <key name="MoveFile.RECENT_KEYS">
      <recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data_word" />
    </key>
  </component>
  <component name="RunManager" selected="Python.app_word">
    <configuration name="app_word" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
      <module name="zzb_data" />
      <option name="ENV_FILES" value="" />
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs>
        <env name="PYTHONUNBUFFERED" value="1" />
      </envs>
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
      <option name="IS_MODULE_SDK" value="true" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/app_word.py" />
      <option name="PARAMETERS" value="" />
      <option name="SHOW_COMMAND_LINE" value="false" />
      <option name="EMULATE_TERMINAL" value="false" />
      <option name="MODULE_MODE" value="false" />
      <option name="REDIRECT_INPUT" value="false" />
      <option name="INPUT_FILE" value="" />
      <method v="2" />
    </configuration>
    <configuration name="main_word" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
      <module name="zzb_data" />
      <option name="ENV_FILES" value="" />
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs>
        <env name="PYTHONUNBUFFERED" value="1" />
      </envs>
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
      <option name="IS_MODULE_SDK" value="true" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/main_word.py" />
      <option name="PARAMETERS" value="" />
      <option name="SHOW_COMMAND_LINE" value="false" />
      <option name="EMULATE_TERMINAL" value="false" />
      <option name="MODULE_MODE" value="false" />
      <option name="REDIRECT_INPUT" value="false" />
      <option name="INPUT_FILE" value="" />
      <method v="2" />
    </configuration>
    <configuration name="parse_word" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
      <module name="zzb_data" />
      <option name="ENV_FILES" value="" />
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs>
        <env name="PYTHONUNBUFFERED" value="1" />
      </envs>
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/parse_word" />
      <option name="IS_MODULE_SDK" value="true" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/parse_word/parse_word.py" />
      <option name="PARAMETERS" value="" />
      <option name="SHOW_COMMAND_LINE" value="false" />
      <option name="EMULATE_TERMINAL" value="false" />
      <option name="MODULE_MODE" value="false" />
      <option name="REDIRECT_INPUT" value="false" />
      <option name="INPUT_FILE" value="" />
      <method v="2" />
    </configuration>
    <configuration name="test" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
      <module name="zzb_data" />
      <option name="ENV_FILES" value="" />
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs>
        <env name="PYTHONUNBUFFERED" value="1" />
      </envs>
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
      <option name="IS_MODULE_SDK" value="true" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/test.py" />
      <option name="PARAMETERS" value="" />
      <option name="SHOW_COMMAND_LINE" value="false" />
      <option name="EMULATE_TERMINAL" value="false" />
      <option name="MODULE_MODE" value="false" />
      <option name="REDIRECT_INPUT" value="false" />
      <option name="INPUT_FILE" value="" />
      <method v="2" />
    </configuration>
    <configuration name="test_process" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
      <module name="zzb_data" />
      <option name="ENV_FILES" value="" />
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs>
        <env name="PYTHONUNBUFFERED" value="1" />
      </envs>
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
      <option name="IS_MODULE_SDK" value="true" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/test_process.py" />
      <option name="PARAMETERS" value="" />
      <option name="SHOW_COMMAND_LINE" value="false" />
      <option name="EMULATE_TERMINAL" value="false" />
      <option name="MODULE_MODE" value="false" />
      <option name="REDIRECT_INPUT" value="false" />
      <option name="INPUT_FILE" value="" />
      <method v="2" />
    </configuration>
    <recent_temporary>
      <list>
        <item itemvalue="Python.app_word" />
        <item itemvalue="Python.test" />
        <item itemvalue="Python.parse_word" />
        <item itemvalue="Python.main_word" />
        <item itemvalue="Python.test_process" />
      </list>
    </recent_temporary>
  </component>
  <component name="SharedIndexes">
    <attachedChunks>
      <set>
        <option value="bundled-python-sdk-5a2391486177-2887949eec09-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-233.13763.11" />
      </set>
    </attachedChunks>
  </component>
  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
  <component name="TaskManager">
    <task active="true" id="Default" summary="Default task">
      <changelist id="22ddc8e3-82b9-4724-8dc4-c1cf50311f22" name="Changes" comment="" />
      <created>1727096188853</created>
      <option name="number" value="Default" />
      <option name="presentableId" value="Default" />
      <updated>1727096188853</updated>
      <workItem from="1727096189919" duration="4242000" />
      <workItem from="1727226275743" duration="13996000" />
      <workItem from="1727406650674" duration="9011000" />
      <workItem from="1727574308954" duration="28121000" />
      <workItem from="1728458648396" duration="3371000" />
      <workItem from="1728462140132" duration="473000" />
      <workItem from="1728462643998" duration="20841000" />
      <workItem from="1728544515382" duration="4091000" />
      <workItem from="1728557155319" duration="4071000" />
      <workItem from="1728607689751" duration="6705000" />
      <workItem from="1728868463278" duration="598000" />
      <workItem from="1728953453192" duration="2839000" />
      <workItem from="1728958252539" duration="12021000" />
      <workItem from="1729042469650" duration="17683000" />
      <workItem from="1729213219892" duration="9267000" />
      <workItem from="1729484773560" duration="4727000" />
    </task>
    <servers />
  </component>
  <component name="TypeScriptGeneratedFilesManager">
    <option name="version" value="3" />
  </component>
  <component name="XDebuggerManager">
    <breakpoint-manager>
      <breakpoints>
        <line-breakpoint enabled="true" suspend="THREAD" type="python-line">
          <url>file://$PROJECT_DIR$/main_word.py</url>
          <line>87</line>
          <option name="timeStamp" value="8" />
        </line-breakpoint>
      </breakpoints>
    </breakpoint-manager>
  </component>
  <component name="com.intellij.coverage.CoverageDataManagerImpl">
    <SUITE FILE_PATH="coverage/zzb_data$app.coverage" NAME="app Coverage Results" MODIFIED="1727226379705" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data$main_word.coverage" NAME="main_word Coverage Results" MODIFIED="1728366719918" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data$redis_service.coverage" NAME="redis_service Coverage Results" MODIFIED="1728537921801" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data_word$test.coverage" NAME="test Coverage Results" MODIFIED="1729495516957" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data$config_init.coverage" NAME="config_init Coverage Results" MODIFIED="1728540429755" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data$test_process.coverage" NAME="test_process Coverage Results" MODIFIED="1728545660471" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data$pdf_title.coverage" NAME="pdf_title Coverage Results" MODIFIED="1727243043393" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data$db_service_word.coverage" NAME="db_service_word Coverage Results" MODIFIED="1727619004690" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data$db_service.coverage" NAME="db_service Coverage Results" MODIFIED="1727572268056" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data$parse_word.coverage" NAME="parse_word Coverage Results" MODIFIED="1728569829164" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/parse_word" />
    <SUITE FILE_PATH="coverage/zzb_data$app_word.coverage" NAME="app_word Coverage Results" MODIFIED="1728569456711" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data_word$app_word.coverage" NAME="app_word Coverage Results" MODIFIED="1729587252480" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
    <SUITE FILE_PATH="coverage/zzb_data_word$parse_word.coverage" NAME="parse_word Coverage Results" MODIFIED="1729257457108" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/parse_word" />
  </component>
</project>
@@ -0,0 +1,186 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="AutoImportSettings">
    <option name="autoReloadType" value="SELECTIVE" />
  </component>
  <component name="ChangeListManager">
    <list default="true" id="22ddc8e3-82b9-4724-8dc4-c1cf50311f22" name="Changes" comment="">
      <change afterPath="$PROJECT_DIR$/app_word.py" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/.vscode/launch.json" beforeDir="false" />
      <change beforePath="$PROJECT_DIR$/app.py" beforeDir="false" afterPath="$PROJECT_DIR$/app.py" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/file/docx/000593-2023-nb-nb.docx" beforeDir="false" afterPath="$PROJECT_DIR$/file/docx/000593-2023-nb-nb.docx" afterDir="false" />
      <change beforePath="$PROJECT_DIR$/parse_word/parse_word.py" beforeDir="false" afterPath="$PROJECT_DIR$/parse_word/parse_word.py" afterDir="false" />
    </list>
    <option name="SHOW_DIALOG" value="false" />
    <option name="HIGHLIGHT_CONFLICTS" value="true" />
    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
    <option name="LAST_RESOLUTION" value="IGNORE" />
  </component>
  <component name="Git.Settings">
    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
  </component>
  <component name="ProjectColorInfo">{
  "associatedIndex": 7
}</component>
  <component name="ProjectId" id="2mTMc5iMC8X5mnsBHls6IKXwyDz" />
  <component name="ProjectViewState">
    <option name="hideEmptyMiddlePackages" value="true" />
    <option name="showLibraryContents" value="true" />
  </component>
  <component name="PropertiesComponent">{
  "keyToString": {
    "Python.app.executor": "Run",
    "Python.app_word.executor": "Run",
    "Python.parse_word.executor": "Run",
    "Python.pdf_title.executor": "Run",
    "RunOnceActivity.OpenProjectViewOnStart": "true",
    "RunOnceActivity.ShowReadmeOnStart": "true",
    "git-widget-placeholder": "pdf-0914(测试环境)",
    "last_opened_file_path": "C:/Users/45272/OneDrive/Documents/work/code/zzb_data",
    "node.js.detected.package.eslint": "true",
    "node.js.detected.package.tslint": "true",
    "node.js.selected.package.eslint": "(autodetect)",
    "node.js.selected.package.tslint": "(autodetect)",
    "nodejs_package_manager_path": "npm",
    "vue.rearranger.settings.migration": "true"
  }
}</component>
  <component name="RecentsManager">
    <key name="CopyFile.RECENT_KEYS">
      <recent name="C:\Users\45272\OneDrive\Documents\work\code\zzb_data" />
    </key>
  </component>
  <component name="RunManager" selected="Python.app_word">
    <configuration name="app" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
      <module name="zzb_data" />
      <option name="ENV_FILES" value="" />
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs>
        <env name="PYTHONUNBUFFERED" value="1" />
      </envs>
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
      <option name="IS_MODULE_SDK" value="true" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/app.py" />
      <option name="PARAMETERS" value="" />
      <option name="SHOW_COMMAND_LINE" value="false" />
      <option name="EMULATE_TERMINAL" value="false" />
      <option name="MODULE_MODE" value="false" />
      <option name="REDIRECT_INPUT" value="false" />
      <option name="INPUT_FILE" value="" />
      <method v="2" />
    </configuration>
    <configuration name="app_word" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
      <module name="zzb_data" />
      <option name="ENV_FILES" value="" />
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs>
        <env name="PYTHONUNBUFFERED" value="1" />
      </envs>
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
      <option name="IS_MODULE_SDK" value="true" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/app_word.py" />
      <option name="PARAMETERS" value="" />
      <option name="SHOW_COMMAND_LINE" value="false" />
      <option name="EMULATE_TERMINAL" value="false" />
      <option name="MODULE_MODE" value="false" />
      <option name="REDIRECT_INPUT" value="false" />
      <option name="INPUT_FILE" value="" />
      <method v="2" />
    </configuration>
    <configuration name="parse_word" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
      <module name="zzb_data" />
      <option name="ENV_FILES" value="" />
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs>
        <env name="PYTHONUNBUFFERED" value="1" />
      </envs>
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/parse_word" />
      <option name="IS_MODULE_SDK" value="true" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/parse_word/parse_word.py" />
      <option name="PARAMETERS" value="" />
      <option name="SHOW_COMMAND_LINE" value="false" />
      <option name="EMULATE_TERMINAL" value="false" />
      <option name="MODULE_MODE" value="false" />
      <option name="REDIRECT_INPUT" value="false" />
      <option name="INPUT_FILE" value="" />
      <method v="2" />
    </configuration>
    <configuration name="pdf_title" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
      <module name="zzb_data" />
      <option name="ENV_FILES" value="" />
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs>
        <env name="PYTHONUNBUFFERED" value="1" />
      </envs>
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/pdf_title.py" />
|
||||
<option name="PARAMETERS" value="" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="false" />
|
||||
<option name="REDIRECT_INPUT" value="false" />
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<recent_temporary>
|
||||
<list>
|
||||
<item itemvalue="Python.app_word" />
|
||||
<item itemvalue="Python.parse_word" />
|
||||
<item itemvalue="Python.pdf_title" />
|
||||
<item itemvalue="Python.app" />
|
||||
</list>
|
||||
</recent_temporary>
|
||||
</component>
|
||||
<component name="SharedIndexes">
|
||||
<attachedChunks>
|
||||
<set>
|
||||
<option value="bundled-js-predefined-1d06a55b98c1-91d5c284f522-JavaScript-PY-241.15989.155" />
|
||||
<option value="bundled-python-sdk-babbdf50b680-7c6932dee5e4-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-241.15989.155" />
|
||||
</set>
|
||||
</attachedChunks>
|
||||
</component>
|
||||
<component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
|
||||
<component name="TaskManager">
|
||||
<task active="true" id="Default" summary="Default task">
|
||||
<changelist id="22ddc8e3-82b9-4724-8dc4-c1cf50311f22" name="Changes" comment="" />
|
||||
<created>1727096188853</created>
|
||||
<option name="number" value="Default" />
|
||||
<option name="presentableId" value="Default" />
|
||||
<updated>1727096188853</updated>
|
||||
<workItem from="1727096189919" duration="4242000" />
|
||||
<workItem from="1727226275743" duration="13996000" />
|
||||
<workItem from="1727406650674" duration="9304000" />
|
||||
</task>
|
||||
<servers />
|
||||
</component>
|
||||
<component name="TypeScriptGeneratedFilesManager">
|
||||
<option name="version" value="3" />
|
||||
</component>
|
||||
<component name="com.intellij.coverage.CoverageDataManagerImpl">
|
||||
<SUITE FILE_PATH="coverage/zzb_data$parse_word.coverage" NAME="parse_word Coverage Results" MODIFIED="1727421672403" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/parse_word" />
|
||||
<SUITE FILE_PATH="coverage/zzb_data$app.coverage" NAME="app Coverage Results" MODIFIED="1727226379705" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
|
||||
<SUITE FILE_PATH="coverage/zzb_data$pdf_title.coverage" NAME="pdf_title Coverage Results" MODIFIED="1727243043393" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
|
||||
<SUITE FILE_PATH="coverage/zzb_data$app_word.coverage" NAME="app_word Coverage Results" MODIFIED="1727422680153" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
|
||||
</component>
|
||||
</project>
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,207 @@
|
|||
import pymssql
|
||||
import mysql.connector
|
||||
import logging
|
||||
from multiprocessing import Pool
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
|
||||
# SQL Server configuration
|
||||
sql_server_config = {
|
||||
"server": "203.192.15.17", # SQL Server host IP
|
||||
"port": 28063, # SQL Server port
|
||||
"user": "zncbuser", # username
|
||||
"password": "ZZB-Cbindex-data", # password
|
||||
"database": "jydb", # database name
|
||||
}
|
||||
|
||||
# MySQL configuration
|
||||
mysql_config = {
|
||||
"host": "rm-bp1f85h3xs6mvnf5e3o.mysql.rds.aliyuncs.com", # MySQL host
|
||||
"user": "zzb_jydb", # username
|
||||
"password": "Ysdbsdjs89Yrqwp", # password
|
||||
"database": "zzb_jydb", # database name
|
||||
}
|
||||
|
||||
# Batch size (rows per query and per insert)
|
||||
BATCH_SIZE = 100000
|
||||
|
||||
# Maximum number of worker processes
|
||||
MAX_PROCESSES = 1
|
||||
|
||||
def sync_table(table_name):
|
||||
try:
|
||||
# Connect to SQL Server
|
||||
sql_server_conn = pymssql.connect(
|
||||
server=sql_server_config["server"],
|
||||
port=sql_server_config["port"],
|
||||
user=sql_server_config["user"],
|
||||
password=sql_server_config["password"],
|
||||
database=sql_server_config["database"],
|
||||
)
|
||||
sql_server_cursor = sql_server_conn.cursor()
|
||||
|
||||
# Connect to MySQL
|
||||
mysql_conn = mysql.connector.connect(**mysql_config)
|
||||
mysql_cursor = mysql_conn.cursor()
|
||||
|
||||
logging.info(f"Processing table: {table_name}")
|
||||
|
||||
# Check whether the table already exists in MySQL
|
||||
mysql_cursor.execute(f"SHOW TABLES LIKE '{table_name}'")
|
||||
table_exists = mysql_cursor.fetchone()
|
||||
|
||||
if not table_exists:
|
||||
# Create the table if it does not exist
|
||||
sql_server_cursor.execute(f"""
|
||||
SELECT
|
||||
COLUMN_NAME,
|
||||
DATA_TYPE,
|
||||
CHARACTER_MAXIMUM_LENGTH,
|
||||
NUMERIC_PRECISION,
|
||||
NUMERIC_SCALE
|
||||
FROM INFORMATION_SCHEMA.COLUMNS
|
||||
WHERE TABLE_NAME = '{table_name}'
|
||||
""")
|
||||
columns = sql_server_cursor.fetchall()
|
||||
|
||||
# Build the MySQL CREATE TABLE statement
|
||||
create_table_sql = f"CREATE TABLE {table_name} ("
|
||||
for col in columns:
|
||||
col_name = col[0]
|
||||
col_type = col[1]
|
||||
|
||||
# Length and precision metadata for the column
|
||||
char_length = col[2]
|
||||
numeric_precision = col[3]
|
||||
numeric_scale = col[4]
|
||||
|
||||
# Naive type mapping (may need adjusting for real schemas)
|
||||
if col_type == "varchar":
|
||||
col_type = "VARCHAR(255)"
|
||||
elif col_type == "int":
|
||||
col_type = "INT"
|
||||
elif col_type == "datetime":
|
||||
col_type = "DATETIME"
|
||||
elif col_type == "decimal":
|
||||
if numeric_precision and numeric_scale:
|
||||
col_type = f"DECIMAL({numeric_precision}, {numeric_scale})"
|
||||
else:
|
||||
col_type = "DECIMAL(10, 2)" # default
|
||||
elif col_type == "money":
|
||||
col_type = "DECIMAL(19, 4)"
|
||||
|
||||
elif col_type == "smallmoney":
|
||||
col_type = "DECIMAL(19, 4)"
|
||||
elif col_type == "image":
|
||||
col_type = "LONGBLOB"
|
||||
|
||||
# Set the column's NULL constraint
|
||||
if col_name.lower() == "id":
|
||||
# The ID column must be NOT NULL
|
||||
create_table_sql += f"`{col_name}` {col_type} NOT NULL, "
|
||||
else:
|
||||
# Other columns allow NULL
|
||||
create_table_sql += f"`{col_name}` {col_type} , "
|
||||
|
||||
# Add the primary key constraint (assumes ID is the primary key)
|
||||
create_table_sql = create_table_sql.rstrip(", ") + f", PRIMARY KEY ({columns[0][0]}))"
|
||||
logging.info(f"Create table SQL: {create_table_sql}")
|
||||
|
||||
# Create the table in MySQL
|
||||
mysql_cursor.execute(create_table_sql)
|
||||
logging.info(f"Table {table_name} created in MySQL.")
|
||||
else:
|
||||
logging.info(f"Table {table_name} already exists in MySQL. Updating data...")
|
||||
|
||||
# Fetch the table's column names
|
||||
sql_server_cursor.execute(f"SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '{table_name}'")
|
||||
columns = sql_server_cursor.fetchall()
|
||||
|
||||
# Get the current max ID in the target table
|
||||
mysql_cursor.execute(f"SELECT MAX({columns[0][0]}) FROM {table_name}")
|
||||
max_id = mysql_cursor.fetchone()[0]
|
||||
if max_id is None:
|
||||
max_id = 0 # empty target table: start from 0
|
||||
logging.info(f"Target table {table_name} has max ID: {max_id}")
|
||||
|
||||
# Pull rows from SQL Server in batches
|
||||
offset = 0
|
||||
while True:
|
||||
# Paginate with ROW_NUMBER()
|
||||
sql_server_cursor.execute(f"""
|
||||
SELECT * FROM (
|
||||
SELECT *, ROW_NUMBER() OVER (ORDER BY {columns[0][0]}) AS RowNum
|
||||
FROM {table_name}
|
||||
WHERE {columns[0][0]} > {max_id}
|
||||
) AS SubQuery
|
||||
WHERE RowNum BETWEEN {offset + 1} AND {offset + BATCH_SIZE}
|
||||
""")
|
||||
rows = sql_server_cursor.fetchall()
|
||||
if not rows:
|
||||
logging.info(f"Table {table_name} is already up to date; nothing to sync")
|
||||
break # no rows left; exit the loop
|
||||
|
||||
insert_values = [row[:-1] for row in rows]  # drop the trailing RowNum column added by the window query
|
||||
|
||||
# Bulk-insert the batch
|
||||
if insert_values:
|
||||
# Build the column list and placeholders dynamically
|
||||
placeholders = ", ".join(["%s"] * len(insert_values[0]))
|
||||
columns_list = ", ".join([col[0] for col in columns])
|
||||
insert_sql = f"INSERT INTO {table_name} ({columns_list}) VALUES ({placeholders})"
|
||||
# Run the bulk insert
|
||||
try:
|
||||
mysql_cursor.executemany(insert_sql, insert_values)
|
||||
mysql_conn.commit()
|
||||
logging.info(f"Inserted {len(insert_values)} rows into {table_name}.")
|
||||
except mysql.connector.errors.DataError as e:
|
||||
logging.error(f"DataError: {e}")
|
||||
mysql_conn.rollback()
|
||||
|
||||
offset += BATCH_SIZE
|
||||
logging.info(f"Processed {offset} rows in {table_name}...")
|
||||
|
||||
# Close connections
|
||||
sql_server_cursor.close()
|
||||
sql_server_conn.close()
|
||||
mysql_cursor.close()
|
||||
mysql_conn.close()
|
||||
|
||||
logging.info(f"Sync completed for table: {table_name}")
|
||||
except Exception as e:
|
||||
logging.error(f"Failed to sync table {table_name}. Error: {e}")
|
||||
|
||||
def main():
|
||||
try:
|
||||
# Connect to SQL Server
|
||||
sql_server_conn = pymssql.connect(
|
||||
server=sql_server_config["server"],
|
||||
port=sql_server_config["port"],
|
||||
user=sql_server_config["user"],
|
||||
password=sql_server_config["password"],
|
||||
database=sql_server_config["database"],
|
||||
)
|
||||
sql_server_cursor = sql_server_conn.cursor()
|
||||
|
||||
# List every base table in SQL Server
|
||||
sql_server_cursor.execute("SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE' ORDER BY TABLE_NAME")
|
||||
tables = sql_server_cursor.fetchall()
|
||||
|
||||
# Sync each table concurrently via a process pool
|
||||
with Pool(processes=MAX_PROCESSES) as pool:
|
||||
pool.map(sync_table, [table[0] for table in tables])
|
||||
|
||||
logging.info("All tables synced successfully!")
|
||||
except Exception as e:
|
||||
logging.error(f"Main function failed. Error: {e}")
|
||||
finally:
|
||||
# Close connections
|
||||
if 'sql_server_cursor' in locals():
|
||||
sql_server_cursor.close()
|
||||
if 'sql_server_conn' in locals():
|
||||
sql_server_conn.close()
|
||||
|
||||
# Entry point
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
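A note on the batch loop above: the ROW_NUMBER() window is recomputed over the whole remaining table on every iteration, so deep pages get progressively more expensive. A minimal keyset-pagination sketch the loop could be swapped to — a hypothetical helper, assuming the key column is the table's first column and holds a monotonically increasing integer:

def fetch_batches(cursor, table_name, key_col, start_id, batch_size=100000):
    # Seek directly past the last key seen instead of re-numbering rows.
    last_id = start_id
    while True:
        cursor.execute(
            f"SELECT TOP {batch_size} * FROM {table_name} "
            f"WHERE {key_col} > %s ORDER BY {key_col}",
            (last_id,),
        )
        rows = cursor.fetchall()
        if not rows:
            break
        yield rows
        last_id = rows[-1][0]  # assumes key_col is the first column

Each yielded batch is then a plain list of tuples, with no synthetic RowNum column to strip before executemany().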
@ -0,0 +1,674 @@
|
|||
GNU GENERAL PUBLIC LICENSE
|
||||
Version 3, 29 June 2007
|
||||
|
||||
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
Preamble
|
||||
|
||||
The GNU General Public License is a free, copyleft license for
|
||||
software and other kinds of works.
|
||||
|
||||
The licenses for most software and other practical works are designed
|
||||
to take away your freedom to share and change the works. By contrast,
|
||||
the GNU General Public License is intended to guarantee your freedom to
|
||||
share and change all versions of a program--to make sure it remains free
|
||||
software for all its users. We, the Free Software Foundation, use the
|
||||
GNU General Public License for most of our software; it applies also to
|
||||
any other work released this way by its authors. You can apply it to
|
||||
your programs, too.
|
||||
|
||||
When we speak of free software, we are referring to freedom, not
|
||||
price. Our General Public Licenses are designed to make sure that you
|
||||
have the freedom to distribute copies of free software (and charge for
|
||||
them if you wish), that you receive source code or can get it if you
|
||||
want it, that you can change the software or use pieces of it in new
|
||||
free programs, and that you know you can do these things.
|
||||
|
||||
To protect your rights, we need to prevent others from denying you
|
||||
these rights or asking you to surrender the rights. Therefore, you have
|
||||
certain responsibilities if you distribute copies of the software, or if
|
||||
you modify it: responsibilities to respect the freedom of others.
|
||||
|
||||
For example, if you distribute copies of such a program, whether
|
||||
gratis or for a fee, you must pass on to the recipients the same
|
||||
freedoms that you received. You must make sure that they, too, receive
|
||||
or can get the source code. And you must show them these terms so they
|
||||
know their rights.
|
||||
|
||||
Developers that use the GNU GPL protect your rights with two steps:
|
||||
(1) assert copyright on the software, and (2) offer you this License
|
||||
giving you legal permission to copy, distribute and/or modify it.
|
||||
|
||||
For the developers' and authors' protection, the GPL clearly explains
|
||||
that there is no warranty for this free software. For both users' and
|
||||
authors' sake, the GPL requires that modified versions be marked as
|
||||
changed, so that their problems will not be attributed erroneously to
|
||||
authors of previous versions.
|
||||
|
||||
Some devices are designed to deny users access to install or run
|
||||
modified versions of the software inside them, although the manufacturer
|
||||
can do so. This is fundamentally incompatible with the aim of
|
||||
protecting users' freedom to change the software. The systematic
|
||||
pattern of such abuse occurs in the area of products for individuals to
|
||||
use, which is precisely where it is most unacceptable. Therefore, we
|
||||
have designed this version of the GPL to prohibit the practice for those
|
||||
products. If such problems arise substantially in other domains, we
|
||||
stand ready to extend this provision to those domains in future versions
|
||||
of the GPL, as needed to protect the freedom of users.
|
||||
|
||||
Finally, every program is threatened constantly by software patents.
|
||||
States should not allow patents to restrict development and use of
|
||||
software on general-purpose computers, but in those that do, we wish to
|
||||
avoid the special danger that patents applied to a free program could
|
||||
make it effectively proprietary. To prevent this, the GPL assures that
|
||||
patents cannot be used to render the program non-free.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow.
|
||||
|
||||
TERMS AND CONDITIONS
|
||||
|
||||
0. Definitions.
|
||||
|
||||
"This License" refers to version 3 of the GNU General Public License.
|
||||
|
||||
"Copyright" also means copyright-like laws that apply to other kinds of
|
||||
works, such as semiconductor masks.
|
||||
|
||||
"The Program" refers to any copyrightable work licensed under this
|
||||
License. Each licensee is addressed as "you". "Licensees" and
|
||||
"recipients" may be individuals or organizations.
|
||||
|
||||
To "modify" a work means to copy from or adapt all or part of the work
|
||||
in a fashion requiring copyright permission, other than the making of an
|
||||
exact copy. The resulting work is called a "modified version" of the
|
||||
earlier work or a work "based on" the earlier work.
|
||||
|
||||
A "covered work" means either the unmodified Program or a work based
|
||||
on the Program.
|
||||
|
||||
To "propagate" a work means to do anything with it that, without
|
||||
permission, would make you directly or secondarily liable for
|
||||
infringement under applicable copyright law, except executing it on a
|
||||
computer or modifying a private copy. Propagation includes copying,
|
||||
distribution (with or without modification), making available to the
|
||||
public, and in some countries other activities as well.
|
||||
|
||||
To "convey" a work means any kind of propagation that enables other
|
||||
parties to make or receive copies. Mere interaction with a user through
|
||||
a computer network, with no transfer of a copy, is not conveying.
|
||||
|
||||
An interactive user interface displays "Appropriate Legal Notices"
|
||||
to the extent that it includes a convenient and prominently visible
|
||||
feature that (1) displays an appropriate copyright notice, and (2)
|
||||
tells the user that there is no warranty for the work (except to the
|
||||
extent that warranties are provided), that licensees may convey the
|
||||
work under this License, and how to view a copy of this License. If
|
||||
the interface presents a list of user commands or options, such as a
|
||||
menu, a prominent item in the list meets this criterion.
|
||||
|
||||
1. Source Code.
|
||||
|
||||
The "source code" for a work means the preferred form of the work
|
||||
for making modifications to it. "Object code" means any non-source
|
||||
form of a work.
|
||||
|
||||
A "Standard Interface" means an interface that either is an official
|
||||
standard defined by a recognized standards body, or, in the case of
|
||||
interfaces specified for a particular programming language, one that
|
||||
is widely used among developers working in that language.
|
||||
|
||||
The "System Libraries" of an executable work include anything, other
|
||||
than the work as a whole, that (a) is included in the normal form of
|
||||
packaging a Major Component, but which is not part of that Major
|
||||
Component, and (b) serves only to enable use of the work with that
|
||||
Major Component, or to implement a Standard Interface for which an
|
||||
implementation is available to the public in source code form. A
|
||||
"Major Component", in this context, means a major essential component
|
||||
(kernel, window system, and so on) of the specific operating system
|
||||
(if any) on which the executable work runs, or a compiler used to
|
||||
produce the work, or an object code interpreter used to run it.
|
||||
|
||||
The "Corresponding Source" for a work in object code form means all
|
||||
the source code needed to generate, install, and (for an executable
|
||||
work) run the object code and to modify the work, including scripts to
|
||||
control those activities. However, it does not include the work's
|
||||
System Libraries, or general-purpose tools or generally available free
|
||||
programs which are used unmodified in performing those activities but
|
||||
which are not part of the work. For example, Corresponding Source
|
||||
includes interface definition files associated with source files for
|
||||
the work, and the source code for shared libraries and dynamically
|
||||
linked subprograms that the work is specifically designed to require,
|
||||
such as by intimate data communication or control flow between those
|
||||
subprograms and other parts of the work.
|
||||
|
||||
The Corresponding Source need not include anything that users
|
||||
can regenerate automatically from other parts of the Corresponding
|
||||
Source.
|
||||
|
||||
The Corresponding Source for a work in source code form is that
|
||||
same work.
|
||||
|
||||
2. Basic Permissions.
|
||||
|
||||
All rights granted under this License are granted for the term of
|
||||
copyright on the Program, and are irrevocable provided the stated
|
||||
conditions are met. This License explicitly affirms your unlimited
|
||||
permission to run the unmodified Program. The output from running a
|
||||
covered work is covered by this License only if the output, given its
|
||||
content, constitutes a covered work. This License acknowledges your
|
||||
rights of fair use or other equivalent, as provided by copyright law.
|
||||
|
||||
You may make, run and propagate covered works that you do not
|
||||
convey, without conditions so long as your license otherwise remains
|
||||
in force. You may convey covered works to others for the sole purpose
|
||||
of having them make modifications exclusively for you, or provide you
|
||||
with facilities for running those works, provided that you comply with
|
||||
the terms of this License in conveying all material for which you do
|
||||
not control copyright. Those thus making or running the covered works
|
||||
for you must do so exclusively on your behalf, under your direction
|
||||
and control, on terms that prohibit them from making any copies of
|
||||
your copyrighted material outside their relationship with you.
|
||||
|
||||
Conveying under any other circumstances is permitted solely under
|
||||
the conditions stated below. Sublicensing is not allowed; section 10
|
||||
makes it unnecessary.
|
||||
|
||||
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
||||
|
||||
No covered work shall be deemed part of an effective technological
|
||||
measure under any applicable law fulfilling obligations under article
|
||||
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
||||
similar laws prohibiting or restricting circumvention of such
|
||||
measures.
|
||||
|
||||
When you convey a covered work, you waive any legal power to forbid
|
||||
circumvention of technological measures to the extent such circumvention
|
||||
is effected by exercising rights under this License with respect to
|
||||
the covered work, and you disclaim any intention to limit operation or
|
||||
modification of the work as a means of enforcing, against the work's
|
||||
users, your or third parties' legal rights to forbid circumvention of
|
||||
technological measures.
|
||||
|
||||
4. Conveying Verbatim Copies.
|
||||
|
||||
You may convey verbatim copies of the Program's source code as you
|
||||
receive it, in any medium, provided that you conspicuously and
|
||||
appropriately publish on each copy an appropriate copyright notice;
|
||||
keep intact all notices stating that this License and any
|
||||
non-permissive terms added in accord with section 7 apply to the code;
|
||||
keep intact all notices of the absence of any warranty; and give all
|
||||
recipients a copy of this License along with the Program.
|
||||
|
||||
You may charge any price or no price for each copy that you convey,
|
||||
and you may offer support or warranty protection for a fee.
|
||||
|
||||
5. Conveying Modified Source Versions.
|
||||
|
||||
You may convey a work based on the Program, or the modifications to
|
||||
produce it from the Program, in the form of source code under the
|
||||
terms of section 4, provided that you also meet all of these conditions:
|
||||
|
||||
a) The work must carry prominent notices stating that you modified
|
||||
it, and giving a relevant date.
|
||||
|
||||
b) The work must carry prominent notices stating that it is
|
||||
released under this License and any conditions added under section
|
||||
7. This requirement modifies the requirement in section 4 to
|
||||
"keep intact all notices".
|
||||
|
||||
c) You must license the entire work, as a whole, under this
|
||||
License to anyone who comes into possession of a copy. This
|
||||
License will therefore apply, along with any applicable section 7
|
||||
additional terms, to the whole of the work, and all its parts,
|
||||
regardless of how they are packaged. This License gives no
|
||||
permission to license the work in any other way, but it does not
|
||||
invalidate such permission if you have separately received it.
|
||||
|
||||
d) If the work has interactive user interfaces, each must display
|
||||
Appropriate Legal Notices; however, if the Program has interactive
|
||||
interfaces that do not display Appropriate Legal Notices, your
|
||||
work need not make them do so.
|
||||
|
||||
A compilation of a covered work with other separate and independent
|
||||
works, which are not by their nature extensions of the covered work,
|
||||
and which are not combined with it such as to form a larger program,
|
||||
in or on a volume of a storage or distribution medium, is called an
|
||||
"aggregate" if the compilation and its resulting copyright are not
|
||||
used to limit the access or legal rights of the compilation's users
|
||||
beyond what the individual works permit. Inclusion of a covered work
|
||||
in an aggregate does not cause this License to apply to the other
|
||||
parts of the aggregate.
|
||||
|
||||
6. Conveying Non-Source Forms.
|
||||
|
||||
You may convey a covered work in object code form under the terms
|
||||
of sections 4 and 5, provided that you also convey the
|
||||
machine-readable Corresponding Source under the terms of this License,
|
||||
in one of these ways:
|
||||
|
||||
a) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by the
|
||||
Corresponding Source fixed on a durable physical medium
|
||||
customarily used for software interchange.
|
||||
|
||||
b) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by a
|
||||
written offer, valid for at least three years and valid for as
|
||||
long as you offer spare parts or customer support for that product
|
||||
model, to give anyone who possesses the object code either (1) a
|
||||
copy of the Corresponding Source for all the software in the
|
||||
product that is covered by this License, on a durable physical
|
||||
medium customarily used for software interchange, for a price no
|
||||
more than your reasonable cost of physically performing this
|
||||
conveying of source, or (2) access to copy the
|
||||
Corresponding Source from a network server at no charge.
|
||||
|
||||
c) Convey individual copies of the object code with a copy of the
|
||||
written offer to provide the Corresponding Source. This
|
||||
alternative is allowed only occasionally and noncommercially, and
|
||||
only if you received the object code with such an offer, in accord
|
||||
with subsection 6b.
|
||||
|
||||
d) Convey the object code by offering access from a designated
|
||||
place (gratis or for a charge), and offer equivalent access to the
|
||||
Corresponding Source in the same way through the same place at no
|
||||
further charge. You need not require recipients to copy the
|
||||
Corresponding Source along with the object code. If the place to
|
||||
copy the object code is a network server, the Corresponding Source
|
||||
may be on a different server (operated by you or a third party)
|
||||
that supports equivalent copying facilities, provided you maintain
|
||||
clear directions next to the object code saying where to find the
|
||||
Corresponding Source. Regardless of what server hosts the
|
||||
Corresponding Source, you remain obligated to ensure that it is
|
||||
available for as long as needed to satisfy these requirements.
|
||||
|
||||
e) Convey the object code using peer-to-peer transmission, provided
|
||||
you inform other peers where the object code and Corresponding
|
||||
Source of the work are being offered to the general public at no
|
||||
charge under subsection 6d.
|
||||
|
||||
A separable portion of the object code, whose source code is excluded
|
||||
from the Corresponding Source as a System Library, need not be
|
||||
included in conveying the object code work.
|
||||
|
||||
A "User Product" is either (1) a "consumer product", which means any
|
||||
tangible personal property which is normally used for personal, family,
|
||||
or household purposes, or (2) anything designed or sold for incorporation
|
||||
into a dwelling. In determining whether a product is a consumer product,
|
||||
doubtful cases shall be resolved in favor of coverage. For a particular
|
||||
product received by a particular user, "normally used" refers to a
|
||||
typical or common use of that class of product, regardless of the status
|
||||
of the particular user or of the way in which the particular user
|
||||
actually uses, or expects or is expected to use, the product. A product
|
||||
is a consumer product regardless of whether the product has substantial
|
||||
commercial, industrial or non-consumer uses, unless such uses represent
|
||||
the only significant mode of use of the product.
|
||||
|
||||
"Installation Information" for a User Product means any methods,
|
||||
procedures, authorization keys, or other information required to install
|
||||
and execute modified versions of a covered work in that User Product from
|
||||
a modified version of its Corresponding Source. The information must
|
||||
suffice to ensure that the continued functioning of the modified object
|
||||
code is in no case prevented or interfered with solely because
|
||||
modification has been made.
|
||||
|
||||
If you convey an object code work under this section in, or with, or
|
||||
specifically for use in, a User Product, and the conveying occurs as
|
||||
part of a transaction in which the right of possession and use of the
|
||||
User Product is transferred to the recipient in perpetuity or for a
|
||||
fixed term (regardless of how the transaction is characterized), the
|
||||
Corresponding Source conveyed under this section must be accompanied
|
||||
by the Installation Information. But this requirement does not apply
|
||||
if neither you nor any third party retains the ability to install
|
||||
modified object code on the User Product (for example, the work has
|
||||
been installed in ROM).
|
||||
|
||||
The requirement to provide Installation Information does not include a
|
||||
requirement to continue to provide support service, warranty, or updates
|
||||
for a work that has been modified or installed by the recipient, or for
|
||||
the User Product in which it has been modified or installed. Access to a
|
||||
network may be denied when the modification itself materially and
|
||||
adversely affects the operation of the network or violates the rules and
|
||||
protocols for communication across the network.
|
||||
|
||||
Corresponding Source conveyed, and Installation Information provided,
|
||||
in accord with this section must be in a format that is publicly
|
||||
documented (and with an implementation available to the public in
|
||||
source code form), and must require no special password or key for
|
||||
unpacking, reading or copying.
|
||||
|
||||
7. Additional Terms.
|
||||
|
||||
"Additional permissions" are terms that supplement the terms of this
|
||||
License by making exceptions from one or more of its conditions.
|
||||
Additional permissions that are applicable to the entire Program shall
|
||||
be treated as though they were included in this License, to the extent
|
||||
that they are valid under applicable law. If additional permissions
|
||||
apply only to part of the Program, that part may be used separately
|
||||
under those permissions, but the entire Program remains governed by
|
||||
this License without regard to the additional permissions.
|
||||
|
||||
When you convey a copy of a covered work, you may at your option
|
||||
remove any additional permissions from that copy, or from any part of
|
||||
it. (Additional permissions may be written to require their own
|
||||
removal in certain cases when you modify the work.) You may place
|
||||
additional permissions on material, added by you to a covered work,
|
||||
for which you have or can give appropriate copyright permission.
|
||||
|
||||
Notwithstanding any other provision of this License, for material you
|
||||
add to a covered work, you may (if authorized by the copyright holders of
|
||||
that material) supplement the terms of this License with terms:
|
||||
|
||||
a) Disclaiming warranty or limiting liability differently from the
|
||||
terms of sections 15 and 16 of this License; or
|
||||
|
||||
b) Requiring preservation of specified reasonable legal notices or
|
||||
author attributions in that material or in the Appropriate Legal
|
||||
Notices displayed by works containing it; or
|
||||
|
||||
c) Prohibiting misrepresentation of the origin of that material, or
|
||||
requiring that modified versions of such material be marked in
|
||||
reasonable ways as different from the original version; or
|
||||
|
||||
d) Limiting the use for publicity purposes of names of licensors or
|
||||
authors of the material; or
|
||||
|
||||
e) Declining to grant rights under trademark law for use of some
|
||||
trade names, trademarks, or service marks; or
|
||||
|
||||
f) Requiring indemnification of licensors and authors of that
|
||||
material by anyone who conveys the material (or modified versions of
|
||||
it) with contractual assumptions of liability to the recipient, for
|
||||
any liability that these contractual assumptions directly impose on
|
||||
those licensors and authors.
|
||||
|
||||
All other non-permissive additional terms are considered "further
|
||||
restrictions" within the meaning of section 10. If the Program as you
|
||||
received it, or any part of it, contains a notice stating that it is
|
||||
governed by this License along with a term that is a further
|
||||
restriction, you may remove that term. If a license document contains
|
||||
a further restriction but permits relicensing or conveying under this
|
||||
License, you may add to a covered work material governed by the terms
|
||||
of that license document, provided that the further restriction does
|
||||
not survive such relicensing or conveying.
|
||||
|
||||
If you add terms to a covered work in accord with this section, you
|
||||
must place, in the relevant source files, a statement of the
|
||||
additional terms that apply to those files, or a notice indicating
|
||||
where to find the applicable terms.
|
||||
|
||||
Additional terms, permissive or non-permissive, may be stated in the
|
||||
form of a separately written license, or stated as exceptions;
|
||||
the above requirements apply either way.
|
||||
|
||||
8. Termination.
|
||||
|
||||
You may not propagate or modify a covered work except as expressly
|
||||
provided under this License. Any attempt otherwise to propagate or
|
||||
modify it is void, and will automatically terminate your rights under
|
||||
this License (including any patent licenses granted under the third
|
||||
paragraph of section 11).
|
||||
|
||||
However, if you cease all violation of this License, then your
|
||||
license from a particular copyright holder is reinstated (a)
|
||||
provisionally, unless and until the copyright holder explicitly and
|
||||
finally terminates your license, and (b) permanently, if the copyright
|
||||
holder fails to notify you of the violation by some reasonable means
|
||||
prior to 60 days after the cessation.
|
||||
|
||||
Moreover, your license from a particular copyright holder is
|
||||
reinstated permanently if the copyright holder notifies you of the
|
||||
violation by some reasonable means, this is the first time you have
|
||||
received notice of violation of this License (for any work) from that
|
||||
copyright holder, and you cure the violation prior to 30 days after
|
||||
your receipt of the notice.
|
||||
|
||||
Termination of your rights under this section does not terminate the
|
||||
licenses of parties who have received copies or rights from you under
|
||||
this License. If your rights have been terminated and not permanently
|
||||
reinstated, you do not qualify to receive new licenses for the same
|
||||
material under section 10.
|
||||
|
||||
9. Acceptance Not Required for Having Copies.
|
||||
|
||||
You are not required to accept this License in order to receive or
|
||||
run a copy of the Program. Ancillary propagation of a covered work
|
||||
occurring solely as a consequence of using peer-to-peer transmission
|
||||
to receive a copy likewise does not require acceptance. However,
|
||||
nothing other than this License grants you permission to propagate or
|
||||
modify any covered work. These actions infringe copyright if you do
|
||||
not accept this License. Therefore, by modifying or propagating a
|
||||
covered work, you indicate your acceptance of this License to do so.
|
||||
|
||||
10. Automatic Licensing of Downstream Recipients.
|
||||
|
||||
Each time you convey a covered work, the recipient automatically
|
||||
receives a license from the original licensors, to run, modify and
|
||||
propagate that work, subject to this License. You are not responsible
|
||||
for enforcing compliance by third parties with this License.
|
||||
|
||||
An "entity transaction" is a transaction transferring control of an
|
||||
organization, or substantially all assets of one, or subdividing an
|
||||
organization, or merging organizations. If propagation of a covered
|
||||
work results from an entity transaction, each party to that
|
||||
transaction who receives a copy of the work also receives whatever
|
||||
licenses to the work the party's predecessor in interest had or could
|
||||
give under the previous paragraph, plus a right to possession of the
|
||||
Corresponding Source of the work from the predecessor in interest, if
|
||||
the predecessor has it or can get it with reasonable efforts.
|
||||
|
||||
You may not impose any further restrictions on the exercise of the
|
||||
rights granted or affirmed under this License. For example, you may
|
||||
not impose a license fee, royalty, or other charge for exercise of
|
||||
rights granted under this License, and you may not initiate litigation
|
||||
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
||||
any patent claim is infringed by making, using, selling, offering for
|
||||
sale, or importing the Program or any portion of it.
|
||||
|
||||
11. Patents.
|
||||
|
||||
A "contributor" is a copyright holder who authorizes use under this
|
||||
License of the Program or a work on which the Program is based. The
|
||||
work thus licensed is called the contributor's "contributor version".
|
||||
|
||||
A contributor's "essential patent claims" are all patent claims
|
||||
owned or controlled by the contributor, whether already acquired or
|
||||
hereafter acquired, that would be infringed by some manner, permitted
|
||||
by this License, of making, using, or selling its contributor version,
|
||||
but do not include claims that would be infringed only as a
|
||||
consequence of further modification of the contributor version. For
|
||||
purposes of this definition, "control" includes the right to grant
|
||||
patent sublicenses in a manner consistent with the requirements of
|
||||
this License.
|
||||
|
||||
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
||||
patent license under the contributor's essential patent claims, to
|
||||
make, use, sell, offer for sale, import and otherwise run, modify and
|
||||
propagate the contents of its contributor version.
|
||||
|
||||
In the following three paragraphs, a "patent license" is any express
|
||||
agreement or commitment, however denominated, not to enforce a patent
|
||||
(such as an express permission to practice a patent or covenant not to
|
||||
sue for patent infringement). To "grant" such a patent license to a
|
||||
party means to make such an agreement or commitment not to enforce a
|
||||
patent against the party.
|
||||
|
||||
If you convey a covered work, knowingly relying on a patent license,
|
||||
and the Corresponding Source of the work is not available for anyone
|
||||
to copy, free of charge and under the terms of this License, through a
|
||||
publicly available network server or other readily accessible means,
|
||||
then you must either (1) cause the Corresponding Source to be so
|
||||
available, or (2) arrange to deprive yourself of the benefit of the
|
||||
patent license for this particular work, or (3) arrange, in a manner
|
||||
consistent with the requirements of this License, to extend the patent
|
||||
license to downstream recipients. "Knowingly relying" means you have
|
||||
actual knowledge that, but for the patent license, your conveying the
|
||||
covered work in a country, or your recipient's use of the covered work
|
||||
in a country, would infringe one or more identifiable patents in that
|
||||
country that you have reason to believe are valid.
|
||||
|
||||
If, pursuant to or in connection with a single transaction or
|
||||
arrangement, you convey, or propagate by procuring conveyance of, a
|
||||
covered work, and grant a patent license to some of the parties
|
||||
receiving the covered work authorizing them to use, propagate, modify
|
||||
or convey a specific copy of the covered work, then the patent license
|
||||
you grant is automatically extended to all recipients of the covered
|
||||
work and works based on it.
|
||||
|
||||
A patent license is "discriminatory" if it does not include within
|
||||
the scope of its coverage, prohibits the exercise of, or is
|
||||
conditioned on the non-exercise of one or more of the rights that are
|
||||
specifically granted under this License. You may not convey a covered
|
||||
work if you are a party to an arrangement with a third party that is
|
||||
in the business of distributing software, under which you make payment
|
||||
to the third party based on the extent of your activity of conveying
|
||||
the work, and under which the third party grants, to any of the
|
||||
parties who would receive the covered work from you, a discriminatory
|
||||
patent license (a) in connection with copies of the covered work
|
||||
conveyed by you (or copies made from those copies), or (b) primarily
|
||||
for and in connection with specific products or compilations that
|
||||
contain the covered work, unless you entered into that arrangement,
|
||||
or that patent license was granted, prior to 28 March 2007.
|
||||
|
||||
Nothing in this License shall be construed as excluding or limiting
|
||||
any implied license or other defenses to infringement that may
|
||||
otherwise be available to you under applicable patent law.
|
||||
|
||||
12. No Surrender of Others' Freedom.
|
||||
|
||||
If conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot convey a
|
||||
covered work so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you may
|
||||
not convey it at all. For example, if you agree to terms that obligate you
|
||||
to collect a royalty for further conveying from those to whom you convey
|
||||
the Program, the only way you could satisfy both those terms and this
|
||||
License would be to refrain entirely from conveying the Program.
|
||||
|
||||
13. Use with the GNU Affero General Public License.
|
||||
|
||||
Notwithstanding any other provision of this License, you have
|
||||
permission to link or combine any covered work with a work licensed
|
||||
under version 3 of the GNU Affero General Public License into a single
|
||||
combined work, and to convey the resulting work. The terms of this
|
||||
License will continue to apply to the part which is the covered work,
|
||||
but the special requirements of the GNU Affero General Public License,
|
||||
section 13, concerning interaction through a network will apply to the
|
||||
combination as such.
|
||||
|
||||
14. Revised Versions of this License.
|
||||
|
||||
The Free Software Foundation may publish revised and/or new versions of
|
||||
the GNU General Public License from time to time. Such new versions will
|
||||
be similar in spirit to the present version, but may differ in detail to
|
||||
address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the
|
||||
Program specifies that a certain numbered version of the GNU General
|
||||
Public License "or any later version" applies to it, you have the
|
||||
option of following the terms and conditions either of that numbered
|
||||
version or of any later version published by the Free Software
|
||||
Foundation. If the Program does not specify a version number of the
|
||||
GNU General Public License, you may choose any version ever published
|
||||
by the Free Software Foundation.
|
||||
|
||||
If the Program specifies that a proxy can decide which future
|
||||
versions of the GNU General Public License can be used, that proxy's
|
||||
public statement of acceptance of a version permanently authorizes you
|
||||
to choose that version for the Program.
|
||||
|
||||
Later license versions may give you additional or different
|
||||
permissions. However, no additional obligations are imposed on any
|
||||
author or copyright holder as a result of your choosing to follow a
|
||||
later version.
|
||||
|
||||
15. Disclaimer of Warranty.
|
||||
|
||||
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
||||
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
||||
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
||||
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
||||
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
||||
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
||||
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. Limitation of Liability.
|
||||
|
||||
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
||||
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
||||
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
||||
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
||||
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
||||
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
||||
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGES.
|
||||
|
||||
17. Interpretation of Sections 15 and 16.
|
||||
|
||||
If the disclaimer of warranty and limitation of liability provided
|
||||
above cannot be given local legal effect according to their terms,
|
||||
reviewing courts shall apply local law that most closely approximates
|
||||
an absolute waiver of all civil liability in connection with the
|
||||
Program, unless a warranty or assumption of liability accompanies a
|
||||
copy of the Program in return for a fee.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Programs
|
||||
|
||||
If you develop a new program, and you want it to be of the greatest
|
||||
possible use to the public, the best way to achieve this is to make it
|
||||
free software which everyone can redistribute and change under these terms.
|
||||
|
||||
To do so, attach the following notices to the program. It is safest
|
||||
to attach them to the start of each source file to most effectively
|
||||
state the exclusion of warranty; and each file should have at least
|
||||
the "copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the program's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
If the program does terminal interaction, make it output a short
|
||||
notice like this when it starts in an interactive mode:
|
||||
|
||||
<program> Copyright (C) <year> <name of author>
|
||||
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
||||
This is free software, and you are welcome to redistribute it
|
||||
under certain conditions; type `show c' for details.
|
||||
|
||||
The hypothetical commands `show w' and `show c' should show the appropriate
|
||||
parts of the General Public License. Of course, your program's commands
|
||||
might be different; for a GUI interface, you would use an "about box".
|
||||
|
||||
You should also get your employer (if you work as a programmer) or school,
|
||||
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
||||
For more information on this, and how to apply and follow the GNU GPL, see
|
||||
<http://www.gnu.org/licenses/>.
|
||||
|
||||
The GNU General Public License does not permit incorporating your program
|
||||
into proprietary programs. If your program is a subroutine library, you
|
||||
may consider it more useful to permit linking proprietary applications with
|
||||
the library. If this is what you want to do, use the GNU Lesser General
|
||||
Public License instead of this License. But first, please read
|
||||
<http://www.gnu.org/philosophy/why-not-lgpl.html>.
|
|
@ -0,0 +1,35 @@
|
|||
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection ,utility
|
||||
# Test environment is up: 114.55.115.191:19530
|
||||
# Connect to Milvus
|
||||
#connections.connect("default", host="114.55.115.191", port="19530")# local test
|
||||
#connections.connect("default", host="124.71.157.162", port="19530")# production
|
||||
# connections.connect("default", host="124.70.129.232", port="19530")# test environment
|
||||
#connections.connect("default", host="1.94.113.19", port="19530")# test environment 13
|
||||
connections.connect("default", host="1.94.60.103", port="19530")# test environment 103
|
||||
|
||||
|
||||
#connections.connect("default", host="192.168.0.129", port="19530")
|
||||
collections = utility.list_collections()
|
||||
# production: 1.94.179.121
|
||||
# test: 114.55.115.191
|
||||
# Print the collection list
|
||||
print("Collections in Milvus:")
|
||||
for collection in collections:
|
||||
print(collection)
|
||||
|
||||
collection = Collection(name='pdf_measure_v4')
|
||||
|
||||
# Collection details
|
||||
#print(f"Collection name: {collection.name}")
|
||||
print(f"Collection schema: {collection.schema}")
|
||||
print(f"Number of entities in collection: {collection.num_entities}")
|
||||
collection.load()  # the collection must be loaded into memory before query/search; load() returns None
|
||||
entities = collection.query(expr="file_id == '39369'", output_fields=["table_num","table_index","measure_name","measure_value","measure_unit","file_id"],limit = 10)
|
||||
count = 0
|
||||
# Print the rows in the collection
|
||||
print("Data in collection:")
|
||||
for entity in entities:
|
||||
print(entity)
|
||||
count += 1
|
||||
print(f'Rows returned under this filter (capped at limit): {count}')
|
||||
connections.disconnect('default')
|
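The loop above can only count as many rows as the query limit returns (10 here), so the printed total is a floor, not an exact count. If the deployment runs Milvus/pymilvus 2.2.9 or newer — an assumption about this environment — the exact count can be fetched server-side instead:

res = collection.query(expr="file_id == '39369'", output_fields=["count(*)"])
print(f"Exact match count: {res[0]['count(*)']}")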
|
@ -0,0 +1,69 @@
|
|||
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection,MilvusClient
|
||||
from config import MILVUS_CLIENT
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
def create_partition_by_hour(current_hour):
|
||||
# Connect to the Milvus server
|
||||
connections.connect("default",uri=MILVUS_CLIENT)
|
||||
# Get the collection
|
||||
collection_name = "pdf_measure_v4"
|
||||
collection = Collection(collection_name)
|
||||
|
||||
# Create the partition for the current hour
|
||||
partition_name = f"partition_{current_hour}"
|
||||
if not collection.has_partition(partition_name):
|
||||
collection.create_partition(partition_name)
|
||||
print(f"Created partition: {partition_name}")
|
||||
partition = collection.partition(partition_name)
|
||||
partition.load()
|
||||
|
||||
# List all partitions
|
||||
partitions = collection.partitions
|
||||
# Drop every partition except the default and the current one
|
||||
for partition in partitions:
|
||||
name = partition.name
|
||||
if name not in ["_default", partition_name]: # keep the default and the current partition
|
||||
pre_partition = collection.partition(name)
|
||||
pre_partition.release()
|
||||
collection.drop_partition(name)
|
||||
print(f"Partition '{name}' deleted.")
|
||||
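# Hypothetical driver (an illustration, not part of the original file): label
# partitions by wall-clock hour and rotate them once per run; datetime is
# already imported above.
if __name__ == "__main__":
    current_hour = datetime.now().strftime("%Y%m%d%H")
    create_partition_by_hour(current_hour)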
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
from pymilvus import connections, CollectionSchema, Collection,utility,FieldSchema,DataType
|
||||
# Connect to Milvus on server B
|
||||
# connections.connect(host='124.70.129.232', port='19530')# test server
|
||||
connections.connect(host='127.0.0.1', port='19530')# test server (local)
|
||||
# Drop the existing collection before recreating it
|
||||
utility.drop_collection("pdf_measure_v4")
|
||||
|
||||
# Define the fields
|
||||
fields = [
|
||||
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
|
||||
FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=1536),
|
||||
FieldSchema(name="table_num", dtype=DataType.INT16),
|
||||
FieldSchema(name="table_index", dtype=DataType.INT16),
|
||||
FieldSchema(name="measure_name", dtype=DataType.VARCHAR, max_length=200),
|
||||
FieldSchema(name="measure_value", dtype=DataType.VARCHAR, max_length=200),
|
||||
FieldSchema(name="file_id", dtype=DataType.VARCHAR, max_length=200),
|
||||
FieldSchema(name="measure_unit", dtype=DataType.VARCHAR, max_length=200)
|
||||
]
|
||||
|
||||
# 定义集合的 schema
|
||||
schema = CollectionSchema(fields=fields, description="My Milvus collection")
|
||||
|
||||
# 创建集合
|
||||
collection = Collection(name="pdf_measure_v4", schema=schema)
|
||||
|
||||
collection = Collection("pdf_measure_v4")
|
||||
index_params = {
|
||||
"index_type": "IVF_FLAT",
|
||||
"metric_type": "COSINE",
|
||||
"params": {"nlist": 128}
|
||||
}
|
||||
collection.create_index(field_name="vector", index_params=index_params)
|
||||
collection.load()
|
|
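# Added sketch (not in the original file): once the IVF_FLAT/COSINE index above
# is built and loaded, a search looks roughly like this. The query vector is a
# placeholder and nprobe=16 is an assumed tuning value, not a project setting.
import random

query_vec = [random.random() for _ in range(1536)]  # real vectors come from the embedding service
results = collection.search(
    data=[query_vec],
    anns_field="vector",
    param={"metric_type": "COSINE", "params": {"nprobe": 16}},
    limit=5,
    output_fields=["measure_name", "measure_value"],
)
for hit in results[0]:
    print(hit.distance, hit.entity.get("measure_name"))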
@ -0,0 +1,366 @@
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import os
import utils
import queue
from multiprocessing import Process, Manager
import pdf_title
import main
import time

import config
import requests
import db_service
import threading


app = FastAPI()
cpu_count = os.cpu_count()
job_queue = queue.Queue()

# Request body model
class FileItem(BaseModel):
    file_path: str
    file_id: str

def run_job():
    # Check whether a job is already queued
    if_run = True

    if job_queue.empty():
        print(f"job_queue为空:")
        if_run = False

    if if_run:
        job_config = job_queue.get()
        page_list = []
        file_path = job_config['file_path']
        file_id = job_config['file_id']
        job_status = True
        continue_execution = True
        try:
            # Download the PDF
            start_time = time.time()
            print(f"开始启动文件解析任务: {file_path}")
            if file_path.startswith('http'):
                file_path = utils.save_pdf_from_url(file_path, config.FILE_PATH)
            try:
                file_info = pdf_title.create_text_outline(file_path, file_id)
            except Exception as e:
                response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id, 'status': 7})
                print(f'通知任务状态url:{file_id}:{response.url}')
                print(f'通知任务状态任务:{file_id}:{response.text}')
                print(f"{file_id}运行失败: {e}")
                continue_execution = False
            if continue_execution:
                print(cpu_count)
                parent_table_pages = file_info['parent_table_pages']
                print('parent_table_pages的值是')
                print(parent_table_pages)

                # page_nums = [
                #     '1-3',
                #     '4-6',
                # ]
                page_num = file_info['page_count']
                if page_num < cpu_count:
                    p_count = page_num
                else:
                    p_count = cpu_count

                for i in range(p_count):
                # for i in range(2):
                    page_list.append({
                        'type': 'table',
                        'page_num': file_info['split_parts']['table_split_parts'][i],
                        # 'page_num': page_nums[i],
                        'path': file_path,
                        'file_id': file_id,
                        'parent_table_pages': parent_table_pages,
                        'page_count': file_info['page_count'],
                        'tables_range': {},
                    })

                # Notify that parsing has started
                response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id, 'status': 5})
                print(f'通知pdf开始解析url:{file_id}:{response.url}')
                print(f'通知pdf开始解析状态:{file_id}:{response.text}')
                parser_start_time = time.time()
                processes = []
                time_dispatch_job = time.time()
                for job_info in page_list:
                    p = Process(target=main.dispatch_job, args=(job_info,))
                    processes.append(p)
                    p.start()
                #time_dispatch_job_end = time.time()
                #process_time = time_dispatch_job_end - time_dispatch_job
                #db_service.process_time(file_id,'1',process_time)

                print('等待所有子任务完成,任务ID:', file_id)
                for p in processes:
                    p.join()
                print('pdf解析任务完成,任务ID:', file_id)
                time_dispatch_job_end = time.time()
                process_time = time_dispatch_job_end - time_dispatch_job
                db_service.process_time(file_id, '1', process_time, time_dispatch_job, time_dispatch_job_end)
                parser_end_time = time.time()
                print(f"解析任务 {file_id} 完成,耗时{(parser_end_time - parser_start_time):.2f} 秒。")
                # Decide here whether the pipeline should continue.
                if db_service.file_type_check(file_id):
                    print("文本较真表格生成已结束")
                else:
                    # Notify that measure extraction is starting
                    response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id, 'status': 6})
                    print(f'通知开始抽取指标url:{file_id}:{response.url}')
                    print(f'通知开始抽取指标状态:{file_id}:{response.text}')

                    parser_start_time = time.time()
                    print('开始表格指标抽取,任务ID:', file_id)
                    time_start = time.time()
                    if db_service.file_type_check_v2(file_id) == 3:  # Is this a Q3 report?
                        main.start_table_measure_job(file_id)
                        #time_start_end = time.time()
                        #process_time = time_start_end - time_start
                        #db_service.process_time(file_id,'2',process_time)
                        time_start_end = time.time()
                        process_time = time_start_end - time_start
                        db_service.process_time(file_id, '2', process_time, time_start, time_start_end)
                        print('表格指标抽取完成,任务ID:', file_id)
                        parser_end_time = time.time()
                        print(f"表格指标抽取 {file_id} 完成,耗时{(parser_end_time - parser_start_time):.2f} 秒。")

                        print('启动这个指标归一化任务ID-修改测试:', file_id)
                        time_update = time.time()
                        main.update_measure_data(file_id, file_path, parent_table_pages)
                        #time_update_end = time.time()
                        #process_time = time_update_end - time_update
                        #db_service.process_time(file_id,'3',process_time)
                        print('归一化完成任务ID:', file_id)
                        end_time = time.time()
                        print(f"任务 {file_id} 完成,耗时{(end_time - start_time):.2f} 秒。")
                        time_update_end = time.time()
                        process_time = time_update_end - time_update
                        db_service.process_time(file_id, '3', process_time, time_update, time_update_end)
                    else:  # Not a Q3 report: follow the annual / semi-annual path
                        main.start_table_measure_job(file_id)
                        #time_start_end = time.time()
                        #process_time = time_start_end - time_start
                        #db_service.process_time(file_id,'2',process_time)
                        time_start_end = time.time()
                        process_time = time_start_end - time_start
                        db_service.process_time(file_id, '2', process_time, time_start, time_start_end)
                        print('表格指标抽取完成,任务ID:', file_id)
                        parser_end_time = time.time()
                        print(f"表格指标抽取 {file_id} 完成,耗时{(parser_end_time - parser_start_time):.2f} 秒。")

                        print('启动这个指标归一化任务ID-修改测试:', file_id)
                        time_update = time.time()
                        main.update_measure_data(file_id, file_path, parent_table_pages)
                        #time_update_end = time.time()
                        #process_time = time_update_end - time_update
                        #db_service.process_time(file_id,'3',process_time)
                        print('归一化完成任务ID:', file_id)
                        end_time = time.time()
                        print(f"任务 {file_id} 完成,耗时{(end_time - start_time):.2f} 秒。")
                        time_update_end = time.time()
                        process_time = time_update_end - time_update
                        db_service.process_time(file_id, '3', process_time, time_update, time_update_end)
                    # Notify that the job is complete
                    response_time = time.time()
                    response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id, 'status': 1})
                    print(f'通知任务状态url:{file_id}:{response.url}')
                    print(f'通知任务状态任务:{file_id}:{response.text}')
                    response_time_end = time.time()
                    process_time = response_time_end - response_time
                    db_service.process_time(file_id, '4', process_time, response_time, response_time_end)
        except Exception as e:
            # Notify that the job failed
            response_time = time.time()
            # The original branched on "integer division or modulo by zero" in str(e),
            # but both branches sent status 4, so one call suffices.
            response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id, 'status': 4})
            response_time_end = time.time()
            process_time = response_time_end - response_time
            db_service.process_time(file_id, '4', process_time, response_time, response_time_end)
            print(f'通知任务状态url:{file_id}:{response.url}')
            print(f'通知任务状态任务:{file_id}:{response.text}')
            print(f"Response status code: {response.status_code}")
            print(f"{file_id}运行失败: {e}")
        finally:
            print(f"任务 {file_id} 完成,运行状态:{job_status}")
            #pdf_company_0824.name_code_fix(file_id,file_path)
            #print('公司名与编码填充完毕')
    else:
        print("有任务运行中,需要等待.....")

def parse_pdf_route(fileItem: FileItem):

    # Queue the job so that only one file is parsed at a time
    job_queue.put({
        'file_path': fileItem.file_path,
        'file_id': fileItem.file_id
    })
    print(f"增加 {fileItem.file_id} 到队列.")

    threading.Thread(target=run_job, args=()).start()

    return {"success": True, "msg": "文件解析开始"}

app.post("/parser/start",
    tags=["parser"],
    summary="解析Pdf文件",
)(parse_pdf_route)

def run_disclosure():
    # Check whether a job is already queued
    if_run = True

    if job_queue.empty():
        print(f"job_queue为空")
        if_run = False

    if if_run:
        job_config = job_queue.get()
        page_list = []
        file_path = job_config['file_path']
        file_id = job_config['file_id']
        job_status = True
        continue_execution = True
        try:
            # Download the PDF
            start_time = time.time()
            print(f"开始启动文件解析任务: {file_path}")
            print('这里是信披')
            if file_path.startswith('http'):
                file_path = utils.save_pdf_from_url(file_path, config.FILE_PATH)
            try:
                file_info = pdf_title.create_text_outline_disclosure(file_path, file_id)
            except Exception as e:
                response = requests.get(config.NOTIFY_ADDR_DIS, params={'fileId': file_id, 'status': 7})
                print(f'通知任务状态url:{file_id}:{response.url}')
                print(f'通知任务状态任务:{file_id}:{response.text}')
                print(f"{file_id}运行失败: {e}")
                continue_execution = False
            if continue_execution:
                print(cpu_count)
                parent_table_pages = file_info['parent_table_pages']
                print('parent_table_pages的值是')
                print(parent_table_pages)

                # page_nums = [
                #     '1-3',
                #     '4-6',
                # ]
                print(cpu_count)
                print('测试')
                page_num = file_info['page_count']
                if page_num < cpu_count:
                    p_count = page_num
                else:
                    p_count = cpu_count

                for i in range(p_count):
                # for i in range(2):
                    page_list.append({
                        'type': 'table',
                        'page_num': file_info['split_parts']['table_split_parts'][i],
                        # 'page_num': page_nums[i],
                        'path': file_path,
                        'file_id': file_id,
                        'parent_table_pages': parent_table_pages,
                        'page_count': file_info['page_count'],
                        'tables_range': {},
                    })

                # Notify that parsing has started
                response = requests.get(config.NOTIFY_ADDR_DIS, params={'fileId': file_id, 'status': 5})
                print(f'通知pdf开始解析url:{file_id}:{response.url}')
                print(f'通知pdf开始解析状态:{file_id}:{response.text}')
                parser_start_time = time.time()
                processes = []
                time_dispatch_job = time.time()
                for job_info in page_list:
                    p = Process(target=main.dispatch_disclosure, args=(job_info,))
                    processes.append(p)
                    p.start()
                #time_dispatch_job_end = time.time()
                #process_time = time_dispatch_job_end - time_dispatch_job
                #db_service.process_time(file_id,'1',process_time)

                print('等待所有子任务完成,任务ID:', file_id)
                for p in processes:
                    p.join()
                print('pdf解析任务完成,任务ID:', file_id)
                time_dispatch_job_end = time.time()
                process_time = time_dispatch_job_end - time_dispatch_job
                #db_service.process_time(file_id,'1',process_time,time_dispatch_job,time_dispatch_job_end)
                parser_end_time = time.time()
                print(f"解析任务 {file_id} 完成,耗时{(parser_end_time - parser_start_time):.2f} 秒。")
                # Decide here whether the pipeline should continue.
                #if db_service.file_type_check(file_id):
                print("文本较真表格生成已结束")

                # Notify that the job is complete
                response_time = time.time()
                response = requests.get(config.NOTIFY_ADDR_DIS, params={'fileId': file_id, 'status': 1})
                print(f'通知任务状态url:{file_id}:{response.url}')
                print(f'通知任务状态任务:{file_id}:{response.text}')
                response_time_end = time.time()
                process_time = response_time_end - response_time
                #db_service.process_time(file_id,'4',process_time,response_time,response_time_end)
        except Exception as e:
            # Notify that the job failed
            response_time = time.time()
            # As in run_job, both branches of the original sent status 4.
            response = requests.get(config.NOTIFY_ADDR_DIS, params={'fileId': file_id, 'status': 4})
            response_time_end = time.time()
            process_time = response_time_end - response_time
            #db_service.process_time(file_id,'4',process_time,response_time,response_time_end)
            print(f'通知任务状态url:{file_id}:{response.url}')
            print(f'通知任务状态任务:{file_id}:{response.text}')
            print(f"Response status code: {response.status_code}")
            print(f"{file_id}运行失败: {e}")
        finally:
            print(f"任务 {file_id} 完成,运行状态:{job_status}")
            #pdf_company_0824.name_code_fix(file_id,file_path)
            #print('公司名与编码填充完毕')
    else:
        print("有任务运行中,需要等待.....")

# Disclosure file parsing
def disclosure(fileItem: FileItem):

    # Queue the job so that only one file is parsed at a time
    job_queue.put({
        'file_path': fileItem.file_path,
        'file_id': fileItem.file_id
    })
    print(f"增加 {fileItem.file_id} 到队列.")

    threading.Thread(target=run_disclosure, args=()).start()

    return {"success": True, "msg": "文件解析开始"}

app.post("/parser/disclosure",
    tags=["parser"],
    summary="信披文件解析",
)(disclosure)

# Run the FastAPI app
if __name__ == "__main__":
    # Start the server
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=config.PORT)

    # Local debug job
    #job_queue.put({
    #    'file_path': '6281.pdf',
    #    'file_id': '6281'
    #})

    #run_job()
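# Added sketch (not in the original file): enqueueing a job over HTTP. The host,
# port, and file URL below are hypothetical; the port comes from config.PORT in
# a real deployment.
import requests

resp = requests.post(
    "http://127.0.0.1:8000/parser/start",
    json={"file_path": "http://static.example.com/report.pdf", "file_id": "39369"},
)
print(resp.json())  # expected: {"success": True, "msg": "文件解析开始"}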
@ -0,0 +1,226 @@
from fastapi import FastAPI
from pydantic import BaseModel
import os
import utils
import queue
import multiprocessing
from multiprocessing import Process
import word_title
import time
import config
import requests
import threading
from parse_word import parse_docx, split_text_table
import json
import db_service_word
import main_word
from zzb_logger import applog


app = FastAPI()
cpu_count = os.cpu_count()
job_queue = queue.Queue()

# Request body model
class FileItem(BaseModel):
    file_path: str
    file_id: str

def split_list(lst, n):
    k, m = divmod(len(lst), n)
    return [lst[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n)]

def run_job():
    # Check whether a job is already queued
    if_run = True

    if job_queue.empty():
        applog.info(f"job_queue为空:")
        if_run = False

    if if_run:
        job_config = job_queue.get()
        file_path = job_config['file_path']
        file_id = job_config['file_id']
        continue_execution = True
        try:

            start_time = time.time()
            applog.info(f"开始启动文件解析任务: {file_path}")
            if file_path.startswith('http'):
                file_path = utils.save_pdf_from_url(file_path, config.FILE_PATH)
            try:
                time_dispatch_job = time.time()
                # Notify that parsing has started (the original note said "temporarily not notifying", but the call is active)
                response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id, 'status': 5})
                applog.info(f'通知pdf开始解析url:{file_id}:{response.url}')
                applog.info(f'通知pdf开始解析状态:{file_id}:{response.text}')
                parsed_content, catalog_content = parse_docx(file_path)  # catalog_content is the outline; it is written to the database

                json_parsed_content = json.loads(parsed_content)
                json_catalog_content = json.loads(catalog_content)

                db_service_word.word_title_insert_mysql(file_id, json_catalog_content)

                parent_table_pages = word_title.get_parent_table_pages(json_catalog_content, file_id)

                text_elements_json, table_elements_json = split_text_table(json_parsed_content)

                processes = []
                text_list = split_list(json.loads(text_elements_json), cpu_count)
                applog.info(f'text,任务ID:{file_id}')
                for job_info in text_list:
                    p = Process(target=main_word.process_text_content, args=(file_id, job_info, json.loads(table_elements_json), json.loads(text_elements_json)))
                    processes.append(p)
                    p.start()
                applog.info(f'等待所有子任务完成,任务ID:{file_id}')
                for p in processes:
                    p.join()
                applog.info(f'word表格中 text解析完成,任务ID:{file_id}')

                processes = []
                table_list = split_list(json.loads(table_elements_json), cpu_count)
                applog.info(f'开始解析word表表格中的table,任务ID:{file_id}')
                for job_info in table_list:
                    p = Process(target=main_word.process_table, args=(file_id, job_info,))
                    processes.append(p)
                    p.start()
                applog.info(f'等待所有子任务完成,任务ID:{file_id}')
                for p in processes:
                    p.join()

                # main_word.process_table(file_id, json.loads(table_elements_json))
                applog.info(f'word表格中 table解析完成,任务ID:{file_id}')


                time_dispatch_job_end = time.time()
                process_time = time_dispatch_job_end - time_dispatch_job
                db_service_word.process_time(file_id, '1', process_time, time_dispatch_job, time_dispatch_job_end)
                parser_end_time = time.time()
                applog.info(f"解析任务 {file_id} 完成,耗时{(parser_end_time - time_dispatch_job):.2f} 秒。")

            except Exception as e:
                response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id, 'status': 7})
                applog.info(f'通知任务状态url:{file_id}:{response.url}')
                applog.info(f'通知任务状态任务:{file_id}:{response.text}')
                applog.info(f"{file_id}运行失败: {e}")
                continue_execution = False
            if continue_execution:
                # Decide here whether the pipeline should continue.
                if db_service_word.file_type_check(file_id):
                    applog.info("文本较真表格生成已结束")
                else:
                    # Notify that measure extraction is starting ---------------------------------
                    response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id, 'status': 6})
                    applog.info(f'通知开始抽取指标url:{file_id}:{response.url}')
                    applog.info(f'通知开始抽取指标状态:{file_id}:{response.text}')

                    parser_start_time = time.time()
                    applog.info(f'开始表格指标抽取,任务ID:{file_id}')
                    time_start = time.time()
                    if db_service_word.file_type_check_v2(file_id) == 3:  # Is this a Q3 report?
                        main_word.start_table_measure_job(file_id)
                        #time_start_end = time.time()
                        #process_time = time_start_end - time_start
                        #db_service.process_time(file_id,'2',process_time)
                        time_start_end = time.time()
                        process_time = time_start_end - time_start
                        db_service_word.process_time(file_id, '2', process_time, time_start, time_start_end)
                        applog.info(f'表格指标抽取完成,任务ID:{file_id}')
                        parser_end_time = time.time()
                        applog.info(f"表格指标抽取 {file_id} 完成,耗时{(parser_end_time - parser_start_time):.2f} 秒。")

                        applog.info(f'启动这个指标归一化任务ID-修改测试:{file_id}')
                        time_update = time.time()
                        main_word.update_measure_data(file_id, file_path, parent_table_pages)
                        #time_update_end = time.time()
                        #process_time = time_update_end - time_update
                        #db_service.process_time(file_id,'3',process_time)
                        applog.info(f'归一化完成任务ID:{file_id}')
                        end_time = time.time()
                        applog.info(f"任务 {file_id} 完成,耗时{(end_time - start_time):.2f} 秒。")
                        time_update_end = time.time()
                        process_time = time_update_end - time_update
                        db_service_word.process_time(file_id, '3', process_time, time_update, time_update_end)
                    else:  # Not a Q3 report: follow the annual / semi-annual path
                        main_word.start_table_measure_job(file_id)
                        #time_start_end = time.time()
                        #process_time = time_start_end - time_start
                        #db_service.process_time(file_id,'2',process_time)
                        time_start_end = time.time()
                        process_time = time_start_end - time_start
                        db_service_word.process_time(file_id, '2', process_time, time_start, time_start_end)
                        applog.info(f'表格指标抽取完成,任务ID:{file_id}')
                        parser_end_time = time.time()
                        applog.info(f"表格指标抽取 {file_id} 完成,耗时{(parser_end_time - parser_start_time):.2f} 秒。")

                        applog.info(f'启动这个指标归一化任务ID-修改测试:{file_id}')
                        time_update = time.time()
                        main_word.update_measure_data(file_id, file_path, parent_table_pages)
                        #time_update_end = time.time()
                        #process_time = time_update_end - time_update
                        #db_service.process_time(file_id,'3',process_time)
                        applog.info(f'归一化完成任务ID:{file_id}')
                        end_time = time.time()
                        applog.info(f"任务 {file_id} 完成,耗时{(end_time - start_time):.2f} 秒。")
                        time_update_end = time.time()
                        process_time = time_update_end - time_update
                        db_service_word.process_time(file_id, '3', process_time, time_update, time_update_end)
                    # Notify that the job is complete
                    response_time = time.time()

                    response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id, 'status': 1})
                    applog.info(f'通知任务状态url:{file_id}:{response.url}')
                    applog.info(f'通知任务状态任务:{file_id}:{response.text}')

                    response_time_end = time.time()
                    process_time = response_time_end - response_time
                    db_service_word.process_time(file_id, '4', process_time, response_time, response_time_end)
        except Exception as e:
            # Notify that the job failed
            response_time = time.time()
            response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id, 'status': 4})
            response_time_end = time.time()
            process_time = response_time_end - response_time
            db_service_word.process_time(file_id, '4', process_time, response_time, response_time_end)
            applog.info(f'通知任务状态url:{file_id}:{response.url}')
            applog.info(f'通知任务状态任务:{file_id}:{response.text}')
            applog.info(f"Response status code: {response.status_code}")
            applog.info(f"{file_id}运行失败: {e}")
        finally:
            applog.info(f"任务 {file_id} 完成")

    else:
        applog.info("有任务运行中,需要等待.....")

def parse_route(fileItem: FileItem):
    # Queue the job so that only one file is parsed at a time
    job_queue.put({
        'file_path': fileItem.file_path,
        'file_id': fileItem.file_id,
        # 'type': fileItem.type
    })
    applog.info(f"增加 {fileItem.file_id} 到队列.")
    threading.Thread(target=run_job, args=()).start()

    return {"success": True, "msg": "文件解析开始"}

app.post("/parser/start",
    tags=["parser"],
    summary="解析Pdf文件",
)(parse_route)

# Run the FastAPI app
if __name__ == "__main__":
    # Server startup (commented out in favor of the local debug job below)
    # import uvicorn
    #
    # uvicorn.run(app, host="0.0.0.0", port=config.PORT)
    # Local debug job
    file_id = "201917"
    job_queue.put({
        'file_path': '1.docx',
        'file_id': file_id,
    })
    db_service_word.delete_database(file_id)
    run_job()
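# Added example (not in the original file): split_list above distributes elements
# into n nearly equal chunks, giving the first len(lst) % n chunks one extra element.
# split_list(list(range(10)), 4)  ->  [[0, 1, 2], [3, 4, 5], [6, 7], [8, 9]]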
@ -0,0 +1,251 @@
import camelot
import time
import re
import ast
import numpy as np
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
import pdfplumber
import json
import utils

def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i : i + n]


def extract_tables(filepath, pages_num, chunk_num=50, export_path=".", params={}):
    """
    Divide the extraction work into n chunks. At the end of every chunk,
    save data on disk and free RAM.

    filepath : str
        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    """

    # get list of pages from camelot.handlers.PDFHandler
    handler = camelot.handlers.PDFHandler(filepath)
    page_list = handler._get_pages(pages=pages_num)

    # chunk pages list
    page_chunks = list(chunks(page_list, chunk_num))

    # extraction and export
    for chunk in page_chunks:
        pages_string = str(chunk).replace("[", "").replace("]", "")
        tables = camelot.read_pdf(filepath, pages=pages_string, strip_text=' ,\n', copy_text=['h'])
        tables.export(f"{export_path}/tables.csv")


# Read the tables in a PDF and join each measure with its header, e.g. "2022年1季度营业收入为xxxxx"
def get_pdf_info(file_path, pages):
    tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'])

    pdf_info = []
    tables_range = {}

    for table_num, t in enumerate(tables):

        top = t._bbox[3]
        buttom = t._bbox[1]
        page_num = int(t.page)
        table_index = int(t.order)
        arr = np.array(t.data)
        if not tables_range.get(page_num):
            tables_range[page_num] = []

        tables_range[page_num].append({
            'top' : top,
            'buttom' : buttom,
            'table_index' : table_index,
            'page_num' : page_num,
        })

        pdf_info.append({
            'top' : top,
            'buttom' : buttom,
            'page_num' : page_num,
            'table_index' : table_index,
            "type" : "table",
            "data" : t.data,
            'sort_num' : page_num*1000 - top
        })

    for pagenum, page in enumerate(extract_pages(file_path)):
        page_elements = [(element.y1, element) for element in page._objs]
        # Walk the elements that make up the page
        for i, component in enumerate(page_elements):

            text_type = 'text'
            # Extract the layout element
            element = component[1]
            # Only consider text elements
            if isinstance(element, LTTextBoxHorizontal):
                # Does this text appear inside a table?
                line_text = element.get_text().replace('\n', '')
                line_text = re.sub(r"\s", "", line_text)

                element_top = element.bbox[3]
                element_buttom = element.bbox[1]

                # Check whether this text sits inside one of the table ranges
                if tables_range.get(pagenum+1):
                    for rng in tables_range[pagenum+1]:  # renamed from `range` to stop shadowing the builtin
                        # print(f"{rng['top']}: {rng['buttom']}: {rng['table_index']}")
                        if element_top < rng['top'] and element_top > rng['buttom']:
                            pass
                        else:
                            if element_top - rng['top'] < 100 and element_top - rng['top'] > 5 and not text_in_table(element_top, tables_range, pagenum+1):
                                if i == 0:
                                    text_type = get_text_type(line_text)
                                    if text_type == 'page_header':
                                        break
                                if utils.check_table_title_black_list(line_text):
                                    print(line_text)

                                    pdf_info.append({
                                        'top' : element_top,
                                        'buttom' : element_buttom,
                                        'page_num' : rng['page_num'],
                                        'table_index' : rng['table_index'],
                                        "type" : text_type,
                                        'content' : line_text,
                                        'sort_num' : rng['page_num']*1000 - element_top
                                    })
                                    break
                # Handle a parent-company table whose title sits at the bottom of the page
                # while the table itself starts on the next page
                if element_buttom < 150 and not text_in_table(element_top, tables_range, pagenum+1):
                    text_type = get_text_type(line_text)

                    if text_type == 'page_footer':
                        continue

                    pdf_info.append({
                        'top' : element_top,
                        'buttom' : element_buttom,
                        'page_num' : pagenum+1,
                        "type" : text_type,
                        'content' : line_text,
                        'sort_num' : (pagenum+1)*1000 - element_top
                    })
                # print(f'{element_top}: {element_buttom}: {line_text}')
    sorted_pdf_info = sorted(pdf_info, key=lambda k: k['sort_num'])
    for info in sorted_pdf_info:
        print(info)


def text_in_table(top, tables_range, page_num):
    if tables_range.get(page_num):
        for rng in tables_range[page_num]:
            if top < rng['top'] and top > rng['buttom']:
                return True
    return False

def get_text_type(text: str):
    first_re = '年度报告'
    page_number_pattern = re.compile(r'^\d+(/\d+)?$')

    if re.search(first_re, text.strip()):
        return 'page_header'

    if page_number_pattern.match(text.strip()):
        return 'page_footer'

    return 'text'

def find_continuous_numbers(numbers):
    # Sort the list first
    numbers.sort()

    # Output list and the start index of the current run
    new_numbers = []
    start_index = 0

    # Walk the sorted list
    for i in range(1, len(numbers)):
        # Is the current number discontinuous with the previous one?
        if numbers[i] != numbers[i-1] + 1:
            if i - start_index > 1:
                # A run longer than one: join min and max with "-"
                new_numbers.append(f"{numbers[start_index]}-{numbers[i-1]}")
            else:
                # A run of one: append the number itself
                # (the original appended numbers[i-1] a second time here,
                # which duplicated every singleton in the output)
                new_numbers.append(str(numbers[start_index]))

            # Start a new run
            start_index = i

    # Handle the run at the end of the list
    if len(numbers) - start_index > 1:
        new_numbers.append(f"{numbers[start_index]}-{numbers[-1]}")
    else:
        new_numbers.append(str(numbers[start_index]))

    return new_numbers

def merge_consecutive_arrays(file_path):
    merged_objects = []
    temp_array = {}

    # Open the file and read it line by line
    with open(file_path, 'r') as file:
        for line in file:
            # Strip the trailing newline
            line = line.strip()
            # Parse the line as a Python literal (the lines are dict reprs, not strict JSON)
            try:
                obj = ast.literal_eval(line)
                if obj['type'] == 'table':
                    # Table rows: accumulate them into the pending table object
                    if not temp_array.get('page_num'):
                        temp_array = obj
                    else:
                        temp_array['data'].extend(obj['data'])
                else:
                    # Not a table: flush the pending table, if any
                    if temp_array:
                        merged_objects.append(temp_array)
                        temp_array = {}  # reset
            except (ValueError, SyntaxError) as e:
                # The original caught json.JSONDecodeError, which eval() never raises
                print(f"Error decoding line: {e}")

    if temp_array:
        merged_objects.append(temp_array)

    # The with-block closes the file; the original also called file.close() redundantly

    return merged_objects



if __name__ == "__main__":
    # print(get_text_type('6/223 '.strip()))
    # start = time.time()
    get_pdf_info('/Users/zhengfei/Desktop/0609/zhangjun-430489-2023-nb-nb_2824bc6302e9442285aed64eed760d99/zhangjun-430489-2023-nb-nb_2824bc6302e9442285aed64eed760d99_1.pdf', 'all')
    # end = time.time()
    # print('Task %s runs %0.2f seconds.' % ('223', (end - start)))
    # Sample list
    # numbers = [1, 2, 3, 5, 7, 9, 10, 12, 13, 14, 17, 18, 19, 20, 22, 23, 24, 26, 27, 28, 29, 30, 32, 33, 34, 36, 37, 38, 39]
    # print(find_continuous_numbers(numbers))
    # Name the columns of both tables, set the index, then merge them:
    # df1 = tables[0].df
    # df2 = df1.rename(columns=df1.iloc[0]).drop(df1.index[0])  # use row 0 as the header
    # df3 = tables[1].df
    # df4 = df3.rename(columns=df3.iloc[0]).drop(df3.index[0])
    # df__2 = df2.append(df4, ignore_index=True)  # align on column names; ignore_index=True builds a fresh index
    # print(df__2)

    # print(merge_consecutive_arrays('/Users/zhengfei/work/zzb_data/tables.txt'))
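# Added example (not in the original file): with the duplicate-append fixed,
# find_continuous_numbers collapses runs into "min-max" strings:
# find_continuous_numbers([1, 2, 3, 5, 7, 9, 10])  ->  ['1-3', '5', '7', '9-10']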
@ -0,0 +1,9 @@
{"auto_id": true, "description": "", "fields":
[{"name": "pk", "description": "", "type": 5, "is_primary": true, "auto_id": true},
{"name": "vector", "description": "", "type": 101, "params": {"dim": 1536}},
{"name": "table_num", "description": "", "type": 3},
{"name": "table_index", "description": "", "type": 3},
{"name": "measure_name", "description": "", "type": 21, "params": {"max_length": 304}},
{"name": "measure_value", "description": "", "type": 21, "params": {"max_length": 100}},
{"name": "file_id", "description": "", "type": 21, "params": {"max_length": 50}},
{"name": "measure_unit", "description": "", "type": 21, "params": {"max_length": 50}}]}
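# Added note (not in the original file): the numeric "type" codes in this schema
# dump appear to be pymilvus DataType enum values; a quick check, assuming
# pymilvus is installed:
# from pymilvus import DataType
# print(int(DataType.INT64), int(DataType.INT16), int(DataType.VARCHAR), int(DataType.FLOAT_VECTOR))
# expected: 5 3 21 101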
@ -0,0 +1,33 @@
MILVUS_CLIENT='http://124.70.129.232:19530'
#MILVUS_CLIENT='http://60.204.228.154:19530'
MYSQL_HOST = '121.37.185.246'
MYSQL_PORT = 3306
MYSQL_USER = 'financial'
MYSQL_PASSWORD = 'financial_8000'
MYSQL_DB = 'financial_report'

# NOTIFY_ADDR = 'http://192.168.0.175:8100/api/tenant/report/notify'


NOTIFY_ADDR = 'http://127.0.0.1:8100/api/tenant/report/notify'

# REDIS_HOST = '127.0.0.1'
REDIS_HOST = '123.60.153.169'
REDIS_PORT = 6379
REDIS_PASSWORD = 'Xgf_redis'
FILE_PATH = '/root/word_parser/word/'
PORT = 8001
MEASURE_COUNT = 8

# MYSQL_HOST_APP = '192.168.0.201'
# MYSQL_PORT_APP = 3306
# MYSQL_USER_APP = 'root'
# MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
# MYSQL_DB_APP = 'financial_report_prod'


MYSQL_HOST_APP = '121.37.185.246'  #192.168.0.201
MYSQL_PORT_APP = 3306
MYSQL_USER_APP = 'financial'
MYSQL_PASSWORD_APP = 'financial_8000'
MYSQL_DB_APP = 'financial_report'
@ -0,0 +1,260 @@
#coding=utf-8
import sys, ast
# from pdfminer.high_level import extract_text
# from pdfminer.pdfparser import PDFParser
# from pdfminer.pdfdocument import PDFDocument
# from pdfminer.pdfpage import PDFPage
import utils
import mysql.connector
# from pymilvus import connections,MilvusClient
import json, time
# import db_service
import numpy as np
import config_p
import redis_service
from config_p import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB, REDIS_HOST, REDIS_PORT, REDIS_PASSWORD
# import main
import redis

def run_job(sec):
    time.sleep(sec)

def measure_config_to_db(conn, cursor):
    insert_query = '''
        INSERT INTO measure_config_half_year
        (measure_id, measure_name, ori_measure_id, ori_measure_name,year)
        VALUES (%s, %s, %s, %s, %s)
    '''
    # Open the config file
    with open('measure_config_all.txt', 'r', encoding='utf-8') as file:
        # Read all lines into a list
        lines = file.readlines()

    # Insert one row per line
    for line in lines:
        config_list = line.strip().split(',')
        measure = config_list[0]
        ori_measure = config_list[1]
        ori_measure_id = utils.get_md5(ori_measure)

        data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure, '2024')
        cursor.execute(insert_query, data_to_insert)
        conn.commit()

def insert_measure_vector(conn, cursor):

    # redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
    redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=6)
    # Fetch the measures to vectorize.
    # The first query was dead code in the original: the second assignment overwrote it.
    # select_query = '''
    # SELECT ori_measure_id,ori_measure_name FROM measure_config_half_year where year='2024'
    # '''
    select_query = '''
    SELECT ori_measure_id,ori_measure_name FROM measure_config where year='2023'
    '''
    cursor.execute(select_query)
    records = cursor.fetchall()
    for record in records:
        if redis_client.hexists('measure_config', record[0]):
            measure_vector = redis_client.hget('measure_config', record[0])
        else:
            print('新增指标', record[1])
            vector_obj = utils.embed_with_str(record[1])
            measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])

        redis_client.hset('measure_config', record[0], measure_vector)
    redis_client.close()
    conn.close()

# def contains_financial_indicators(text):
#     import re
#     # Regex for thousands-separated numbers and percentages
#     pattern = r"\d{1,3}(,\d{3})+(\.\d{1,3})?"

#     pattern1 = r"\d+(.\d+)+%?"
#     # Look for a match with re.search
#     match = re.search(pattern1, text)

#     # True if a match was found, otherwise False
#     return bool(match)

# def get_clean_text(text):
#     import re
#     pattern = r"\([^)]*?\)"
#     matches = re.findall(pattern, text)
#     for match in matches:
#         # Does the parenthesized content mention one of the keywords?
#         month_keywords_found = re.search(r"归属于|扣非", match)
#         if not month_keywords_found:
#             # If not, drop that part from the text
#             text = re.sub(pattern, "", text)
#         else:
#             # Otherwise strip punctuation and Chinese numerals
#             text = re.sub(r"[^\w\s]", "", text)
#     print(text)

# def insert_and_update(conn,cursor,client,parent_table_pages,file_id,path):
#     # Look up measures by vector
#     db_service.insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_id,path)

#     # Normalize the measures
#     db_service.update_ori_measure(conn,cursor,file_id)

# def print_measure_data(cursor,client):
#     select_query = '''
#     SELECT ori_measure_name,measure_name,ori_measure_id FROM measure_config
#     where measure_id not in(select distinct measure_id from ori_measure_list where file_id='64')
#     '''
#     cursor.execute(select_query)
#     records = cursor.fetchall()
#     for record in records:
#         ori_measure_name = record[0]
#         measure_name = record[1]
#         ori_measure_id = record[2]
#         measure_vector = redis_service.read_from_redis(ori_measure_id)

#         measure_list = ast.literal_eval(measure_vector)
#         data = [measure_list]
#         res = client.search(
#             collection_name="pdf_measure_v4", # Replace with the actual name of your collection
#             # Replace with your query vector
#             data=data,
#             limit=2, # Max. number of search results to return
#             search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
#             output_fields=["measure_name","measure_value","table_num","table_index"],
#             filter = 'file_id == "64"'
#         )
#         vector_str = measure_name+":"+ori_measure_name
#         # Convert the output to a formatted JSON string
#         for i in range(len(res[0])):

#             vector_distance = float(res[0][i]["distance"])
#             vector_measure_name = res[0][i]["entity"]["measure_name"]
#             measure_value = res[0][i]["entity"]["measure_value"]
#             table_num = res[0][i]["entity"]["table_num"]
#             table_index = res[0][i]["entity"]["table_index"]
#             table_num_list = [106]
#             print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index))
#             # if vector_distance > 0.89 and table_num not in table_num_list:
#             #     print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(0.94))
#             # if vector_distance > distance and table_num not in table_num_list:
#             #     print(vector_str +":"+vector_measure_name +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(vector_distance)+":"+str(distance))


if __name__ == "__main__":
    # redis_client = redis.Redis(host='123.60.153.169', port=6379, password='Xgf_redis', db=6)
    # vector = redis_service.read_from_redis(redis_client,'893301b0e4f1e07d16b4830fcdaea28a')
    # print(vector)
    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB
    )
    cursor = conn.cursor()

    # measure_config_to_db(conn,cursor)

    insert_measure_vector(conn, cursor)

    # cursor.close()
    # conn.close()
    # import re
    # text = '减少11.04百分点'
    # if re.match(r'(增加|减少)[了]?(\d+\.\d+)[个]?百分点', text):
    #     print('找到了单位。')

    # unit_pattern = re.compile(r'(增加|减少)[了]?(\d+\.\d+)[个]?百分点')

    # match = unit_pattern.search(text)
    # print(len(match.groups()))

    # if match:
    #     print(f'找到单位。')
    # else:
    #     print(f'没有找到单位。')
    # row1 = ['比例','比率','占比','费用']
    # row2 = ['同比增减','同比上升','同比下降','变化幅度','变动比例','本期比上年同期增减','本年比上年增减','同比变动','本期期末金额较上期期末变动比例']

    # for i in range(len(row1)):
    #     for j in range(len(row2)):
    #         print(f"{row1[i]}{row2[j]}")
    # import os,re
    # file_path = '/projects/ai_chat/knowledge_base/ydkf/content/体育运动处方及应用_13925781.docx'

    # # Get the file name and extension
    # file_base_name, file_extension = os.path.splitext(os.path.basename(file_path))
    # file_base_name = file_base_name.replace("_", "").replace("\d+", "")
    # file_base_name = re.sub(r'\d+', '', file_base_name)
    # print(f'文件名: {file_base_name}')
    # import re
    # print(len(re.findall('母公司|现金流量表补充', '补充资料')))
    # import threading

    # # Create a ThreadLocal variable
    # local_data = threading.local()

    # # Worker function run by each thread
    # def worker():
    #     # Give the current thread's ThreadLocal variable a value
    #     local_data.data = f"Thread {threading.current_thread().name}'s data"
    #     print(local_data.data)

    # # Create and start several threads
    # threads = []
    # for i in range(3):
    #     thread = threading.Thread(target=worker)
    #     thread.start()
    #     threads.append(thread)

    # # Wait for all threads to finish
    # for thread in threads:
    #     thread.join()
    # for i in range(2,5):
    #     print(i)
    # file_url = 'http://static.cninfo.com.cn/finalpage/2023-04-11/1216368607.PDF'
    # file_path = utils.save_pdf_from_url(file_url, config.FILE_PATH)
    # redis_client = redis.Redis(host='123.60.153.169', port=6379, password='Xgf_redis', db=6)
    # print(redis_client.hget('measure_config', '2805fd5b7bfa960eb08312fa3d7c08'))
    # client = MilvusClient(
    #     uri= MILVUS_CLIENT
    # )
    # conn = mysql.connector.connect(
    #     host=MYSQL_HOST,
    #     user=MYSQL_USER,
    #     password=MYSQL_PASSWORD,
    #     database=MYSQL_DB
    # )
    # cursor = conn.cursor()
    # print_measure_data(cursor,client)
    # redis_service.read_from_file_and_write_to_redis(conn,cursor)
    # redis_service.read_from_redis()
    # parent_table_pages = []
    # file_id = '67'
    # path = '/Users/zhengfei/Desktop/上汽车配/上汽车配_1.pdf'

    # db_service.insert_table_measure_from_vector_test(conn,cursor,client,parent_table_pages,file_id,path)

    # db_service.update_ori_measure(conn,cursor,file_id)

    # main.get_table_measure(path,'all',file_id)

    # insert_and_update(conn,cursor,client,parent_table_pages,file_id,path)


    # measure_config_to_db(conn,cursor)
    # params = ['f_102','f_103',]
    # for param in params:
    #     globals()[param] = param.replace('f_','')
    # # insert_measure_vector(conn,cursor)
    # print(globals()['f_102'])
    # db_service.update_ori_measure(conn,cursor,file_id)

    # conn.commit()
    # cursor.close()
    # conn.close()
    # # print(utils.get_md5('当期营业收入,2023年营业收入'))
    # count_range_parts = utils.get_range(2300)

    # print(count_range_parts)
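# Added sketch (not in the original file): utils.get_md5 is used throughout as a
# stable ID for a measure name. Its implementation is not shown in this diff, but
# it is presumably equivalent to:
import hashlib

def get_md5_sketch(text: str) -> str:
    # Presumed equivalent: hex digest of the UTF-8 encoded string.
    return hashlib.md5(text.encode("utf-8")).hexdigest()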
@ -0,0 +1,33 @@
MILVUS_CLIENT='http://127.0.0.1:19530'
#MILVUS_CLIENT='http://60.204.228.154:19530'
# MYSQL_HOST = '121.37.185.246'
# MYSQL_PORT = 3306
# MYSQL_USER = 'financial'
# MYSQL_PASSWORD = 'financial_8000'
# MYSQL_DB = 'financial_report'

NOTIFY_ADDR = 'http://192.168.0.166:8100/api/tenant/report/notify'
# NOTIFY_ADDR_ID = 'http://192.168.0.175:8100/api/tenant/info/notify'

# NOTIFY_ADDR = 'http://127.0.0.1:8100/api/tenant/report/notify'

REDIS_HOST = '192.168.0.172'
# REDIS_HOST = '123.60.153.169'
REDIS_PORT = 6379
REDIS_PASSWORD = 'Xgf_redis'
FILE_PATH = '/root/pdf_parser/word/'
PORT = 8001
MEASURE_COUNT = 8

MYSQL_HOST = '192.168.0.142'  #192.168.0.201
MYSQL_PORT = 3306
MYSQL_USER = 'financial_prod'
MYSQL_PASSWORD = 'mmTFncqmDal5HLRGY0BV'
MYSQL_DB = 'financial_report_prod'

MYSQL_HOST_APP = '192.168.0.142'  #192.168.0.201
MYSQL_PORT_APP = 3306
MYSQL_USER_APP = 'financial_prod'
MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
MYSQL_DB_APP = 'financial_report_prod'
@ -0,0 +1,15 @@
MILVUS_CLIENT='http://127.0.0.1:19530'
MILVUS_HOST = '127.0.0.1'
MILVUS_PORT = 19530
MYSQL_HOST = '75e59185a2624316882c98206dbe4c49in01.internal.cn-east-3.mysql.rds.myhuaweicloud.com'
MYSQL_PORT = 3306
MYSQL_USER = 'financial_prod'
MYSQL_PASSWORD = 'mmTFncqmDal5HLRGY0BV'
MYSQL_DB = 'financial_report_prod'
NOTIFY_ADDR = 'http://192.168.0.166:8100/api/tenant/report/notify'
FILE_PATH = '/root/pdf_parser/pdf/'
REDIS_HOST = '192.168.0.172'
REDIS_PORT = 6379
REDIS_PASSWORD = 'Xgf_redis'
PORT = 8000
MEASURE_COUNT = 8
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -0,0 +1,119 @@
import pandas as pd
import json
import utils
from config_p import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
import mysql.connector

# Read the Excel file
df = pd.read_excel('/Users/zhengfei/Desktop/cb/ttt.xlsx', header=0)

# Convert the DataFrame to a list of dicts
data_list = df.to_dict(orient='records')

year = 2023

conn = mysql.connector.connect(
    host=MYSQL_HOST,
    user=MYSQL_USER,
    password=MYSQL_PASSWORD,
    database=MYSQL_DB
)

# Create a cursor to execute SQL
cursor = conn.cursor()

insert_query = '''
    INSERT INTO measure_create_config
    (config_id, meta_measure, same_mean_measure, measure_period, change_type, black_list)
    VALUES (%s, %s, %s, %s, %s, %s)
'''

for data in data_list:
    show_measure = str(data['指标'])
    same_mean_measure = str(data['同义表述'])
    period_measure = str(data['周期'])
    change_measure = str(data['变动'])
    black_list = str(data['黑名单词'])
    config_id = utils.get_md5(show_measure)
    insert_query_data = (config_id, show_measure, same_mean_measure, period_measure, change_measure, black_list)
    cursor.execute(insert_query, insert_query_data)
    conn.commit()

# Read the period Excel file
df_period = pd.read_excel('/Users/zhengfei/Desktop/cb/period.xlsx', header=0)

# Convert the DataFrame to a list of dicts
period_list = df_period.to_dict(orient='records')

period_insert_query = '''
    INSERT INTO measure_create_period
    (period_name, same_mean_period)
    VALUES (%s, %s)
'''

for data in period_list:
    period_name = str(data['标准表述'])
    same_mean_period = str(data['同义表述'])

    insert_query_data = (period_name, same_mean_period)
    cursor.execute(period_insert_query, insert_query_data)
    conn.commit()

data_query = '''
    SELECT * FROM measure_create_config where delete_status = 0
'''
period_query = '''
    SELECT * FROM measure_create_period
'''

cursor.execute(data_query)
data_list = cursor.fetchall()

cursor.execute(period_query)
period_list = cursor.fetchall()

for data in data_list:
    config_id = data[0]
    show_measure = data[1]
    same_mean_measure = data[2]
    period_measure = data[3]
    change_measure = data[4]
    same_mean_measure_arr = []
    period_measure_arr = []
    change_measure_arr = []
    if same_mean_measure != 'nan':
        same_mean_measure_arr = same_mean_measure.split(',')
        same_mean_measure_arr.append(show_measure)
    if period_measure != 'nan':
        period_measure_arr = period_measure.split(',')
    if change_measure != 'nan':
        change_measure_arr = change_measure.split(',')

    for c in change_measure_arr:
        period_measure_arr.append(c)

    for x in period_measure_arr:
        if x in change_measure_arr:
            show_name = show_measure + x
        else:
            show_name = x + show_measure
        for y in same_mean_measure_arr:
            # Membership test against the token list (the original tested the raw
            # string `change_measure`, which is a substring check, not a token check)
            if x in change_measure_arr:
                parser_name = y + x
            else:
                parser_name = x + y

            print(f'{show_name},{parser_name}')
            for p in period_list:
                period_exra_name = p[0]
                period_exra_value = p[1]
                if x.find(period_exra_name) != -1:
                    for v in period_exra_value.split(','):
                        if x in change_measure_arr:
                            parser_name = y + x.replace(period_exra_name, v)
                        else:
                            parser_name = x.replace(period_exra_name, v) + y
                        print(f'{show_name},{parser_name}')

cursor.close()
conn.close()
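# Added example (not in the original file): for a hypothetical config row with
# 指标=营业收入, 同义表述=营业总收入, 周期=2023年, 变动=同比增减, the loops above print:
# 2023年营业收入,2023年营业总收入
# 2023年营业收入,2023年营业收入
# 营业收入同比增减,营业总收入同比增减
# 营业收入同比增减,营业收入同比增减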
@ -0,0 +1,72 @@
import pandas as pd
import json
import utils
from config_p import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
import mysql.connector


def getId(name):
    # The original listed the categories with bare identifiers as dict keys (the
    # `name` argument and the `id` builtin); string keys are what was intended.
    categorys = [
        {"name": "术后康复", "id": 1},
        {"name": "运动损伤康复", "id": 2},
        {"name": "慢病康复", "id": 3},
        {"name": "运动训练", "id": 4},
        {"name": "健康科普", "id": 5},
    ]
    object_dict = {obj["name"]: obj["id"] for obj in categorys}
    if name in object_dict:
        return object_dict[name]
    else:
        return 6

# Read the Excel file
df = pd.read_excel('/Users/zhengfei/Desktop/book.xlsx', header=0)

# Convert the DataFrame to a list of dicts
data_list = df.to_dict(orient='records')

conn = mysql.connector.connect(
    host='rm-bp1vns6jjy6yu46lhio.mysql.rds.aliyuncs.com',
    user='hematiyu',
    password='00a09f971769499f8c0495505ab0922C',
    database='km'
)

# Create a cursor to execute SQL
cursor = conn.cursor()

for data in data_list:
    print(data)
    book_name = str(data['书名']).replace('\n', '')
    category = str(data['分类'])
    category_name = category.split(',')[0]

    category = getId(category_name)
    keywords = str(data['关键词'])
    if keywords == 'nan':
        keywords = ''
    # Note: this is an UPDATE built with str.format (the original named it
    # insert_query); a parameterized query would be safer.
    update_query = '''
        update km_doc set category = {category},keywords = '{keywords}',source = 1 where title = '{book_name}'
    '''.format(book_name=book_name, category=category, keywords=keywords)
    print(update_query)
    cursor.execute(update_query)
    conn.commit()


cursor.close()
conn.close()
@ -0,0 +1,22 @@
import os
import shutil

# Stock codes to match in file names
numbers = ['837242','830839','837212','830832','430510','835670','837092','831689','832419','831278','838171','834261','430476','831195','872190','833394','872953','831304','832651','873132','832651','600060','600076','600180','600188','600212','600219','600223','600229','600308','600309','600319','600336','600350','600426','600448','600467','600529','600547','600579','600586','600587','600600','600690','600727','600735','600756','600760','600777','600783','600784','600789','600804','600807','600858','600898','600960','600966','600022','600027','600017','601678','601058','601028','603167','603798','603779','603421','603612','603021','601366','603367','601966','603029','603639','603026','603858','603223','601163','603708','603577','603086','603638','603217','603536','603113','603586','603856','601019','600918','603967','605006','603278','603279','603755','603739','601298','603187','605198','688002','605001','605100','601665','603102','688579','688309','605287','605016','688556','605589','688677','688191','688663','688681','600955','603836','688087','605567','603182','603190','603151','601096','688695','603270','603285','688002','688139','688363','688021','688579','688309','688556','688557','688677','688136','688191','688161','688501','688663','688681','688087','688190','688234','688331','688455','688035','688695','000407','000409','000423','000488','000498','000503','000506','000554','000599','000639','000655','000668','000677','000680','000682','000720','000726','000756','000811','000822','000830','000869','000880','000915','000951','000957','000977','002026','002041','002073','002078','002083','002086','002088','002094','002107','002111','002117','000338','002254','002193','002237','002234','002242','002241','002248','002270','002283','002286','002490','300001','002323','002330','002339','002355','002353','002374','002376','002363','002379','002382','002408','300105','002469','002458','300099','300321','300110','002476','002470','002595','002481','300121','002485','002521','002498','002643','002805','300143','002526','002537','300175','002545','300183','300185','300214','002581','300208','002580','300224','002589','300233','300237','002588','002598','300243','300285','002655','002675','300308','002671','300343','002746','002838','002726','002768','300443','300479','002793','300423','002810','300569','300659','300583','002871','300996','300699','300801','300653','300690','002890','300677','002891','003033','002921','002899','002958','002948','301017','300786','300848','301035','300654','300594','300779','301015','002984','301020','300830','300821','300840','003022','301199','301299','300918','300950','300948','300993','003042','001207','301022','001219','301069','301185','301149','301188','301296','301158','301439','301206','301262','301209','301320','301281','301337','301456','001260','001300','001331','301292','301498','001379']
# Source and target directories
source_dir = '/Users/zhengfei/Desktop/cb'
target_dir = '/Users/zhengfei/Desktop/sandong'

# Walk the source directory
for root, dirs, files in os.walk(source_dir):
    for file in files:
        # (the original had a stray no-op `numbers.remove` statement here)
        # Copy the file if its name contains any of the codes
        if any(str(number) in file for number in numbers):
            # Full path of the source file
            file_path = os.path.join(root, file)
            # Full path in the target directory
            target_path = os.path.join(target_dir, file)
            # Copy the file
            shutil.copy2(file_path, target_path)
            print(f"文件 {file_path} 已拷贝到 {target_dir}")
File diff suppressed because it is too large
@@ -0,0 +1,99 @@
# coding=utf-8
import random
from http import HTTPStatus
from dashscope import Generation


# Send the report text/tables to the LLM; it returns the raw extracted indicator lines.
def get_measure_from_llm(user_prompt):
    """Send text and table data to the LLM and return the raw list of indicators it extracts."""
    llm_measure_list = []
    system_prompt = '''
你是一个优秀的金融分析师,从给定的数据报告中自动提取以下关键财务指标。指标包括:
2023年营业收入
2022年营业收入
2021年营业收入
2023年第一季度营业收入
2023年第二季度营业收入
2023年第三季度营业收入
2023年第四季度营业收入
营业收入同比变动
2023年归母净利润
2022年归母净利润
2021年归母净利润
2023年第一季度归母净利润
2023年第二季度归母净利润
2023年第三季度归母净利润
2023年第四季度归母净利润
归母净利润同比变动
2023年扣非净利润
2022年扣非净利润
2021年扣非净利润
2023年第一季度扣非净利润
2023年第二季度扣非净利润
2023年第三季度扣非净利润
2023年第四季度扣非净利润
扣非净利润同比变动
2023年经营活动现金流净额
2022年经营活动现金流净额
2021年经营活动现金流净额
经营活动现金流净额同比变动
2023年筹资活动现金流净额
2022年筹资活动现金流净额
2021年筹资活动现金流净额
2023年投资活动现金流净额
2022年投资活动现金流净额
2021年投资活动现金流净额
2023年非经常性损益
2022年非经常性损益
2021年非经常性损益
2023年基本每股收益
2022年基本每股收益
2021年基本每股收益
2023年稀释每股收益
2022年稀释每股收益
2021年稀释每股收益
2023年加权平均净资产收益率
2022年加权平均净资产收益率
2021年加权平均净资产收益率
2023年扣非加权平均净资产收益率
2022年扣非加权平均净资产收益率
2021年扣非加权平均净资产收益率
请确保只抽取这些指标,并且每个指标的输出格式为:指标名:指标值,只需要按格式输出,不要增加其他内容。所有的指标值必须从用户提供的信息中抽取,不允许自己生成,如果找不到相关指标,指标值显示为-
<数据报告>
<user_prompt>
</数据报告>
'''
    # Splice the caller's report text into the prompt template.
    system_prompt = system_prompt.replace('<user_prompt>', user_prompt)
    response = Generation.call(
        model='qwen-plus',
        prompt=system_prompt,
        seed=random.randint(1, 10000),
        top_p=0.8,
        result_format='message',
        enable_search=False,
        max_tokens=1500,
        temperature=0.85,
        repetition_penalty=1.0
    )
    if response.status_code == HTTPStatus.OK:
        result = response['output']['choices'][0]['message']['content']
        llm_measure_list = result.split('\n')
        return llm_measure_list
    else:
        print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
            response.request_id, response.status_code,
            response.code, response.message
        ))
        return "llm_error"


if __name__ == '__main__':
    user_prompt = '''
二、 经营情况回顾 (一) 经营计划 2023 年,在国际环境复杂多变以及全球经济依旧下行的形势下,公司严格按照既定发展战略和经营计划,狠抓落实,迎难而上,业务经营整体保持稳定,如期完成全年既定经营目标。在全体职员的共同努力下,公司的营业收入、净利润等各项指标再创历史新高,营业收入较上年同期实现15.43%的增长,归属于上市公司股东的净利润较上年同期实现 26.47%的增长。 1、财务状况 报告期末,公司资产总额为 1,473,271,310.23 元,增幅为 19.17%,主要系:一方面随着销售规模的不断增长,公司应收账款及合同资产等流动资产增幅较大,另一方面,为解决基于销售规模扩大引致的产能跟不上的瓶颈,公司上马扩产建设项目,导致在建工程、固定资产等非流动资产增幅较报告期末公司负债总额为 800,619,067.70 元,增幅为 26.12%,主要系随着销售规模增加、工程建设项目推进、固定资产购置等,公司采购数额大幅增加,公司通过银行借款等方式筹集资金,导致长短期贷款期末余额增幅较大。 报告期末,归属于上市公司股东的净资产为 670,316,339.35 元,增幅为 11.45%,主要系报告期内经营积累。 2、经营成果 报告期内,公司实现营业收入 1,003,535,799.51 元,增幅为 15.43%。主要系公司本期持续优化生产经营,大力推进产品研发和创新,抓住“双碳”政策以及“能效”提升产生的市场需求旺盛的有利时机,且随着公司北交所上市,产品品牌效应凸显,产能增加,订单获取能力增强,变压器及户外成套设备销售增长较多。 营业成本为 810,779,075.89 元,增幅为 15.33%,主要系报告期内销售增长及主要原材料价格变动所致。归属于上市公司股东的净利润为 73,033,633.31 元,增幅为 26.47%,主要系:1)公司持续优化生产经营,大力推进产品研发和创新,抓住“双碳”政策以及“能效”提升产生的市场需求旺盛的有利时机,生产和销售均呈稳定增长;2)本期处置开源路 1-1 号土地及建筑物及其他附属物等,结转资产处置收益同比增加。
'''
    measure_list = get_measure_from_llm(user_prompt)
    print(measure_list)
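get_measure_from_llm returns the model's raw 指标名:指标值 lines. Below is a minimal sketch of turning them into a dict for downstream use; the helper name parse_measure_lines is ours, not part of this commit, and it assumes the model follows the prompt's output format, with '-' marking a value it could not find.

def parse_measure_lines(lines):
    """Turn '指标名:指标值' lines into {name: value}; '-' becomes None."""
    measures = {}
    for line in lines:
        line = line.strip().replace('：', ':')  # normalize full-width colons, in case the model emits them
        if ':' not in line:
            continue  # skip blank or malformed lines
        name, _, value = line.partition(':')
        value = value.strip()
        measures[name.strip()] = None if value == '-' else value
    return measures

# e.g. parse_measure_lines(['2023年营业收入:1,003,535,799.51元', '2021年营业收入:-'])
# -> {'2023年营业收入': '1,003,535,799.51元', '2021年营业收入': None}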
@@ -0,0 +1,204 @@
|
|||
2024-12-29 16:13:29,975|zzb_logger : INFO 开始启动文件解析任务: 1.docx
|
||||
2024-12-29 16:13:36,106|zzb_logger : INFO 任务 201917 完成
|
||||
2024-12-29 16:15:16,205|zzb_logger : INFO 开始启动文件解析任务: 1.docx
|
||||
2024-12-29 16:15:22,356|zzb_logger : INFO 任务 201917 完成
|
||||
2024-12-29 16:17:15,693|zzb_logger : INFO 开始启动文件解析任务: 1.docx
|
||||
2024-12-29 16:17:15,696|zzb_logger : INFO 通知pdf开始解析url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=5
|
||||
2024-12-29 16:17:15,696|zzb_logger : INFO 通知pdf开始解析状态:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
||||
"http://www.w3.org/TR/html4/strict.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
|
||||
<title>Error response</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Error response</h1>
|
||||
<p>Error code: 404</p>
|
||||
<p>Message: File not found.</p>
|
||||
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
2024-12-29 16:17:25,319|zzb_logger : INFO text,任务ID:201917
|
||||
2024-12-29 16:17:26,701|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (5116)...
|
||||
2024-12-29 16:17:28,173|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (22268)...
|
||||
2024-12-29 16:17:29,591|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (27736)...
|
||||
2024-12-29 16:17:30,937|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (38276)...
|
||||
2024-12-29 16:17:32,294|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (38292)...
|
||||
2024-12-29 16:17:33,664|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (38240)...
|
||||
2024-12-29 16:17:35,153|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (28536)...
|
||||
2024-12-29 16:17:36,559|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (37552)...
|
||||
2024-12-29 16:17:37,929|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (37856)...
|
||||
2024-12-29 16:17:39,291|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (10528)...
|
||||
2024-12-29 16:17:40,688|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (31444)...
|
||||
2024-12-29 16:17:42,133|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (11108)...
|
||||
2024-12-29 16:17:43,518|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (23236)...
|
||||
2024-12-29 16:17:44,901|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (23572)...
|
||||
2024-12-29 16:17:46,495|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (39604)...
|
||||
2024-12-29 16:17:47,899|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (4076)...
|
||||
2024-12-29 16:17:47,899|zzb_logger : INFO 等待所有子任务完成,任务ID:201917
|
||||
2024-12-29 16:18:02,194|zzb_logger : INFO word表格中 text解析完成,任务ID:201917
|
||||
2024-12-29 16:18:02,196|zzb_logger : INFO 开始解析word表表格中的table,任务ID:201917
|
||||
2024-12-29 16:18:03,525|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36176)...
|
||||
2024-12-29 16:18:04,585|zzb_logger : INFO Task 解析表格201917 runs 1.06 seconds.
|
||||
2024-12-29 16:18:04,873|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (35368)...
|
||||
2024-12-29 16:18:05,769|zzb_logger : INFO Task 解析表格201917 runs 0.90 seconds.
|
||||
2024-12-29 16:18:06,263|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (33004)...
|
||||
2024-12-29 16:18:07,225|zzb_logger : INFO Task 解析表格201917 runs 0.96 seconds.
|
||||
2024-12-29 16:18:07,628|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (30764)...
|
||||
2024-12-29 16:18:08,427|zzb_logger : INFO Task 解析表格201917 runs 0.80 seconds.
|
||||
2024-12-29 16:18:08,976|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (29608)...
|
||||
2024-12-29 16:18:09,864|zzb_logger : INFO Task 解析表格201917 runs 0.89 seconds.
|
||||
2024-12-29 16:18:10,588|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (5404)...
|
||||
2024-12-29 16:18:11,360|zzb_logger : INFO Task 解析表格201917 runs 0.77 seconds.
|
||||
2024-12-29 16:18:11,966|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36200)...
|
||||
2024-12-29 16:18:12,030|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36328)...
|
||||
2024-12-29 16:18:12,892|zzb_logger : INFO Task 解析表格201917 runs 0.93 seconds.
|
||||
2024-12-29 16:18:13,034|zzb_logger : INFO Task 解析表格201917 runs 1.00 seconds.
|
||||
2024-12-29 16:18:13,392|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (39712)...
|
||||
2024-12-29 16:18:14,166|zzb_logger : INFO Task 解析表格201917 runs 0.77 seconds.
|
||||
2024-12-29 16:18:15,030|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (17184)...
|
||||
2024-12-29 16:18:15,084|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (38828)...
|
||||
2024-12-29 16:18:15,156|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (39596)...
|
||||
2024-12-29 16:18:15,194|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36908)...
|
||||
2024-12-29 16:18:15,268|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (38088)...
|
||||
2024-12-29 16:18:15,273|zzb_logger : INFO 解析表格时出现了异常 setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (8,) + inhomogeneous part. 内容为{'type': 'table', 'index': 1438, 'data': [['项目', '期末', '期末', '期末', '期末', '期末', '期初', '期初', '期初', '期初', '期初', '期初', '期初', '期初'], ['', '账面余额', '账面价值', '受限类型', '受限情况', '受限情况', '账面余额', '账面余额', '账面价值', '账面价值', '受限类型', '受限类型', '受限情况', ''], ['货币资金', '485,532.72', '485,532.72', '', '住房专用基金', '住房专用基金', '482,151.75', '482,151.75', '482,151.75', '482,151.75', '', '', '住房专用基金', ''], ['固定资产', '9,798,299.46', '9,798,299.46', '', '金融机构借款抵押', '3,747,470.09', '3,747,470.09', '3,747,470.09', '3,747,470.09', '', '', '金融机构借款抵押', '金融机构借款抵押'], ['无形资产', '7,982,261.87', '7,982,261.87', '', '金融机构借款抵押', '5,437,462.92', '5,437,462.92', '5,437,462.92', '5,437,462.92', '', '', '金融机构借款抵押', '金融机构借款抵押'], ['货币资金', '43,997,452.57', '43,997,452.57', '', '银行保证金', '63,388,483.00', '63,388,483.00', '63,388,483.00', '63,388,483.00', '', '', '银行保证金', '银行保证金'], ['投资性房地产', '62,041,831.52', '62,041,831.52', '', '金融机构借款抵押', '67,653,392.10', '67,653,392.10', '67,653,392.10', '67,653,392.10', '', '', '金融机构借款抵押', '金融机构借款抵押'], ['合计', '124,305,378.14', '124,305,378.14', '', '', '140,708,959.86', '140,708,959.86', '140,708,959.86', '140,708,959.86', '', '', '', '']]}
|
||||
2024-12-29 16:18:15,722|zzb_logger : INFO Task 解析表格201917 runs 0.69 seconds.
|
||||
2024-12-29 16:18:15,873|zzb_logger : INFO Task 解析表格201917 runs 0.79 seconds.
|
||||
2024-12-29 16:18:16,067|zzb_logger : INFO Task 解析表格201917 runs 0.91 seconds.
|
||||
2024-12-29 16:18:16,086|zzb_logger : INFO Task 解析表格201917 runs 0.89 seconds.
|
||||
2024-12-29 16:18:16,158|zzb_logger : INFO Task 解析表格201917 runs 0.89 seconds.
|
||||
2024-12-29 16:18:16,787|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (39052)...
|
||||
2024-12-29 16:18:16,847|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (35928)...
|
||||
2024-12-29 16:18:17,456|zzb_logger : INFO Task 解析表格201917 runs 0.61 seconds.
|
||||
2024-12-29 16:18:17,644|zzb_logger : INFO Task 解析表格201917 runs 0.86 seconds.
|
||||
2024-12-29 16:18:17,819|zzb_logger : INFO word表格中 table解析完成,任务ID:201917
|
||||
2024-12-29 16:18:17,985|zzb_logger : INFO 解析任务 201917 完成,耗时62.29 秒。
|
||||
2024-12-29 16:18:18,106|zzb_logger : INFO 通知开始抽取指标url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=6
|
||||
2024-12-29 16:18:18,106|zzb_logger : INFO 通知开始抽取指标状态:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
||||
"http://www.w3.org/TR/html4/strict.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
|
||||
<title>Error response</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Error response</h1>
|
||||
<p>Error code: 404</p>
|
||||
<p>Message: File not found.</p>
|
||||
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
2024-12-29 16:18:18,107|zzb_logger : INFO 开始表格指标抽取,任务ID:201917
|
||||
2024-12-29 16:18:20,187|zzb_logger : INFO 提取指标任务 0-10 (29656)...
|
||||
2024-12-29 16:18:21,575|zzb_logger : INFO 提取指标任务 10-20 (38952)...
|
||||
2024-12-29 16:18:22,849|zzb_logger : INFO 提取指标任务 20-30 (31900)...
|
||||
2024-12-29 16:18:24,192|zzb_logger : INFO 提取指标任务 30-40 (30420)...
|
||||
2024-12-29 16:18:25,554|zzb_logger : INFO 提取指标任务 40-50 (32448)...
|
||||
2024-12-29 16:18:26,909|zzb_logger : INFO 提取指标任务 50-60 (37708)...
|
||||
2024-12-29 16:18:28,305|zzb_logger : INFO 提取指标任务 60-70 (36136)...
|
||||
2024-12-29 16:18:28,933|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,933|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,933|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,936|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
|
||||
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 不适用不适用
|
||||
2024-12-29 16:18:29,637|zzb_logger : INFO 提取指标任务 70-80 (39120)...
|
||||
2024-12-29 16:18:42,814|zzb_logger : INFO 被删除的字符: 000000
|
||||
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
|
||||
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
|
||||
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
|
||||
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
|
||||
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
|
||||
2024-12-29 16:18:46,511|zzb_logger : INFO 提取指标 40-50 runs 20.96 seconds.
|
||||
2024-12-29 16:18:54,027|zzb_logger : INFO 提取指标 70-80 runs 24.39 seconds.
|
||||
2024-12-29 16:19:17,236|zzb_logger : INFO 提取指标 60-70 runs 48.93 seconds.
|
||||
2024-12-29 16:19:20,151|zzb_logger : INFO 提取指标 30-40 runs 55.96 seconds.
|
||||
2024-12-29 16:19:40,383|zzb_logger : INFO 提取指标 50-60 runs 73.47 seconds.
|
||||
2024-12-29 16:20:06,573|zzb_logger : INFO 提取指标 0-10 runs 106.39 seconds.
|
||||
2024-12-29 16:20:44,937|zzb_logger : INFO 提取指标 10-20 runs 143.36 seconds.
|
||||
2024-12-29 16:20:50,959|zzb_logger : INFO 提取指标 20-30 runs 148.11 seconds.
|
||||
2024-12-29 16:20:51,337|zzb_logger : INFO 表格指标抽取完成,任务ID:201917
|
||||
2024-12-29 16:20:51,337|zzb_logger : INFO 表格指标抽取 201917 完成,耗时153.23 秒。
|
||||
2024-12-29 16:20:51,337|zzb_logger : INFO 启动这个指标归一化任务ID-修改测试:201917
|
||||
2024-12-29 16:20:51,549|zzb_logger : INFO 目录黑名单为:[]
|
||||
2024-12-29 16:20:52,316|zzb_logger : INFO 向量配置数据查询 0.11 秒。
|
||||
2024-12-29 16:20:52,317|zzb_logger : INFO insert_table_measure_from_vector_async_process方法走的半年报
|
||||
2024-12-29 16:20:54,191|zzb_logger : INFO Run task 0-351 (41216)...
|
||||
2024-12-29 16:20:54,192|zzb_logger : INFO 插入数据 2815
|
||||
2024-12-29 16:20:54,742|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
|
||||
2024-12-29 16:20:55,664|zzb_logger : INFO Run task 351-702 (16388)...
|
||||
2024-12-29 16:20:55,664|zzb_logger : INFO 插入数据 2815
|
||||
2024-12-29 16:20:56,152|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
|
||||
2024-12-29 16:20:57,120|zzb_logger : INFO Run task 702-1053 (41796)...
|
||||
2024-12-29 16:20:57,120|zzb_logger : INFO 插入数据 2815
|
||||
2024-12-29 16:20:57,611|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
|
||||
2024-12-29 16:20:58,818|zzb_logger : INFO Run task 1053-1404 (39320)...
|
||||
2024-12-29 16:20:58,818|zzb_logger : INFO 插入数据 2815
|
||||
2024-12-29 16:20:59,324|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
|
||||
2024-12-29 16:21:00,159|zzb_logger : INFO Run task 1404-1755 (41868)...
|
||||
2024-12-29 16:21:00,159|zzb_logger : INFO 插入数据 2815
|
||||
2024-12-29 16:21:00,887|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
|
||||
2024-12-29 16:21:01,473|zzb_logger : INFO Run task 1755-2106 (26816)...
|
||||
2024-12-29 16:21:01,473|zzb_logger : INFO 插入数据 2815
|
||||
2024-12-29 16:21:02,171|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
|
||||
2024-12-29 16:21:02,832|zzb_logger : INFO Run task 2106-2457 (32120)...
|
||||
2024-12-29 16:21:02,832|zzb_logger : INFO 插入数据 2815
|
||||
2024-12-29 16:21:03,703|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
|
||||
2024-12-29 16:21:04,179|zzb_logger : INFO 等待所有子任务完成,任务ID:201917
|
||||
2024-12-29 16:21:04,179|zzb_logger : INFO Run task 2457-2815 (38332)...
|
||||
2024-12-29 16:21:04,179|zzb_logger : INFO 插入数据 2815
|
||||
2024-12-29 16:21:04,886|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
|
||||
2024-12-29 16:23:00,285|zzb_logger : INFO 所有子任务完成,任务ID:201917
|
||||
2024-12-29 16:23:00,286|zzb_logger : INFO 启动指标归一化任务ID:201917
|
||||
2024-12-29 16:23:00,286|zzb_logger : INFO 向量更新时间 127.97 秒。
|
||||
2024-12-29 16:23:00,474|zzb_logger : INFO 更新数据查询 0.17 秒。
|
||||
2024-12-29 16:23:00,474|zzb_logger : INFO update_ori_measure方法走的是半年报
|
||||
2024-12-29 16:23:00,474|zzb_logger : INFO 更新数据更新 0.00 秒。
|
||||
2024-12-29 16:23:00,522|zzb_logger : INFO 更新数据写入 0.05 秒。
|
||||
2024-12-29 16:23:00,522|zzb_logger : INFO 归一化完成任务ID:201917
|
||||
2024-12-29 16:23:00,522|zzb_logger : INFO 任务 201917 完成,耗时344.83 秒。
|
||||
2024-12-29 16:23:00,669|zzb_logger : INFO 通知任务状态url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=1
|
||||
2024-12-29 16:23:00,669|zzb_logger : INFO 通知任务状态任务:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
||||
"http://www.w3.org/TR/html4/strict.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
|
||||
<title>Error response</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Error response</h1>
|
||||
<p>Error code: 404</p>
|
||||
<p>Message: File not found.</p>
|
||||
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
2024-12-29 16:23:00,821|zzb_logger : INFO 任务 201917 完成
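One failure in the log above deserves a note: the 解析表格 worker hit numpy's "setting an array element with a sequence ... inhomogeneous shape" error because that table's rows have unequal lengths. The usual fix is to pad every row to the widest row before building the array; a sketch follows (the helper name pad_table_rows is ours, not from this commit):

import numpy as np

def pad_table_rows(rows, fill=''):
    """Right-pad each row so np.array receives a rectangular table."""
    width = max(len(r) for r in rows)
    return np.array([r + [fill] * (width - len(r)) for r in rows])

# pad_table_rows([['项目', '期末'], ['合计']]).shape -> (2, 2)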
File diff suppressed because it is too large
@@ -0,0 +1,162 @@
|
|||
2024-11-25 15:33:22,588|zzb_logger : INFO 开始启动文件解析任务: 103.docx
|
||||
2024-11-25 15:33:22,593|zzb_logger : INFO 通知pdf开始解析url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=5
|
||||
2024-11-25 15:33:22,593|zzb_logger : INFO 通知pdf开始解析状态:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
||||
"http://www.w3.org/TR/html4/strict.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
|
||||
<title>Error response</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Error response</h1>
|
||||
<p>Error code: 404</p>
|
||||
<p>Message: File not found.</p>
|
||||
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
2024-11-25 15:33:28,433|zzb_logger : INFO text,任务ID:201917
|
||||
2024-11-25 15:33:29,616|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (14328)...
|
||||
2024-11-25 15:33:31,068|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (28108)...
|
||||
2024-11-25 15:33:32,200|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (19476)...
|
||||
2024-11-25 15:33:33,366|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (17332)...
|
||||
2024-11-25 15:33:34,692|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (23168)...
|
||||
2024-11-25 15:33:35,803|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (26276)...
|
||||
2024-11-25 15:33:36,919|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (20716)...
|
||||
2024-11-25 15:33:38,051|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (1760)...
|
||||
2024-11-25 15:33:39,160|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (13296)...
|
||||
2024-11-25 15:33:40,302|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (8592)...
|
||||
2024-11-25 15:33:41,406|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (20664)...
|
||||
2024-11-25 15:33:42,511|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (21840)...
|
||||
2024-11-25 15:33:43,619|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (19108)...
|
||||
2024-11-25 15:33:44,744|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (29096)...
|
||||
2024-11-25 15:33:45,854|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (17024)...
|
||||
2024-11-25 15:33:47,001|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (18668)...
|
||||
2024-11-25 15:33:47,001|zzb_logger : INFO 等待所有子任务完成,任务ID:201917
|
||||
2024-11-25 15:34:03,934|zzb_logger : INFO word表格中 text解析完成,任务ID:201917
|
||||
2024-11-25 15:34:03,936|zzb_logger : INFO 开始解析word表表格中的table,任务ID:201917
|
||||
2024-11-25 15:34:05,071|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (7472)...
|
||||
2024-11-25 15:34:06,182|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (19500)...
|
||||
2024-11-25 15:34:06,445|zzb_logger : INFO Task 解析表格201917 runs 1.37 seconds.
|
||||
2024-11-25 15:34:07,083|zzb_logger : INFO 等待所有子任务完成,任务ID:201917
|
||||
2024-11-25 15:34:07,641|zzb_logger : INFO Task 解析表格201917 runs 1.46 seconds.
|
||||
2024-11-25 15:34:08,265|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (20888)...
|
||||
2024-11-25 15:34:08,386|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (28568)...
|
||||
2024-11-25 15:34:08,464|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (26716)...
|
||||
2024-11-25 15:34:08,592|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (27376)...
|
||||
2024-11-25 15:34:08,663|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (12360)...
|
||||
2024-11-25 15:34:08,791|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (28692)...
|
||||
2024-11-25 15:34:08,797|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (11684)...
|
||||
2024-11-25 15:34:08,892|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (21064)...
|
||||
2024-11-25 15:34:08,948|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (24608)...
|
||||
2024-11-25 15:34:08,994|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (8632)...
|
||||
2024-11-25 15:34:09,098|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (23436)...
|
||||
2024-11-25 15:34:09,138|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (15992)...
|
||||
2024-11-25 15:34:09,176|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (9844)...
|
||||
2024-11-25 15:34:09,219|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (17936)...
|
||||
2024-11-25 15:34:09,298|zzb_logger : INFO Task 解析表格201917 runs 0.91 seconds.
|
||||
2024-11-25 15:34:09,399|zzb_logger : INFO Task 解析表格201917 runs 1.13 seconds.
|
||||
2024-11-25 15:34:09,428|zzb_logger : INFO Task 解析表格201917 runs 0.96 seconds.
|
||||
2024-11-25 15:34:09,565|zzb_logger : INFO Task 解析表格201917 runs 0.97 seconds.
|
||||
2024-11-25 15:34:09,637|zzb_logger : INFO Task 解析表格201917 runs 0.84 seconds.
|
||||
2024-11-25 15:34:09,963|zzb_logger : INFO Task 解析表格201917 runs 1.01 seconds.
|
||||
2024-11-25 15:34:10,020|zzb_logger : INFO Task 解析表格201917 runs 1.23 seconds.
|
||||
2024-11-25 15:34:10,036|zzb_logger : INFO Task 解析表格201917 runs 1.37 seconds.
|
||||
2024-11-25 15:34:10,073|zzb_logger : INFO Task 解析表格201917 runs 0.93 seconds.
|
||||
2024-11-25 15:34:10,168|zzb_logger : INFO Task 解析表格201917 runs 1.28 seconds.
|
||||
2024-11-25 15:34:10,223|zzb_logger : INFO Task 解析表格201917 runs 1.12 seconds.
|
||||
2024-11-25 15:34:10,265|zzb_logger : INFO Task 解析表格201917 runs 1.27 seconds.
|
||||
2024-11-25 15:34:10,304|zzb_logger : INFO Task 解析表格201917 runs 1.13 seconds.
|
||||
2024-11-25 15:34:10,404|zzb_logger : INFO Task 解析表格201917 runs 1.18 seconds.
|
||||
2024-11-25 15:34:10,557|zzb_logger : INFO word表格中 table解析完成,任务ID:201917
|
||||
2024-11-25 15:34:10,728|zzb_logger : INFO 解析任务 201917 完成,耗时48.14 秒。
|
||||
2024-11-25 15:34:10,879|zzb_logger : INFO 通知开始抽取指标url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=6
|
||||
2024-11-25 15:34:10,879|zzb_logger : INFO 通知开始抽取指标状态:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
||||
"http://www.w3.org/TR/html4/strict.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
|
||||
<title>Error response</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Error response</h1>
|
||||
<p>Error code: 404</p>
|
||||
<p>Message: File not found.</p>
|
||||
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
2024-11-25 15:34:10,879|zzb_logger : INFO 开始表格指标抽取,任务ID:201917
|
||||
2024-11-25 15:34:12,902|zzb_logger : INFO 提取指标任务 0-8 (20908)...
|
||||
2024-11-25 15:34:13,964|zzb_logger : INFO 提取指标任务 8-16 (23592)...
|
||||
2024-11-25 15:34:15,047|zzb_logger : INFO 提取指标任务 16-24 (12664)...
|
||||
2024-11-25 15:34:16,203|zzb_logger : INFO 提取指标任务 24-32 (29872)...
|
||||
2024-11-25 15:34:17,576|zzb_logger : INFO 提取指标任务 32-40 (28748)...
|
||||
2024-11-25 15:34:18,385|zzb_logger : INFO 提取指标任务 40-48 (2204)...
|
||||
2024-11-25 15:34:19,517|zzb_logger : INFO 提取指标任务 48-56 (22344)...
|
||||
2024-11-25 15:34:20,613|zzb_logger : INFO 提取指标任务 56-66 (18352)...
|
||||
2024-11-25 15:34:26,136|zzb_logger : INFO 提取指标 48-56 runs 6.62 seconds.
|
||||
2024-11-25 15:34:36,392|zzb_logger : INFO 提取指标 24-32 runs 20.19 seconds.
|
||||
2024-11-25 15:34:43,329|zzb_logger : INFO 提取指标 56-66 runs 22.72 seconds.
|
||||
2024-11-25 15:34:47,575|zzb_logger : INFO 提取指标 40-48 runs 29.19 seconds.
|
||||
2024-11-25 15:34:56,075|zzb_logger : INFO 提取指标 16-24 runs 41.03 seconds.
|
||||
2024-11-25 15:34:59,737|zzb_logger : INFO 提取指标 32-40 runs 42.16 seconds.
|
||||
2024-11-25 15:35:26,785|zzb_logger : INFO 提取指标 0-8 runs 73.88 seconds.
|
||||
2024-11-25 15:36:47,235|zzb_logger : INFO 提取指标 8-16 runs 153.27 seconds.
|
||||
2024-11-25 15:36:47,522|zzb_logger : INFO 表格指标抽取完成,任务ID:201917
|
||||
2024-11-25 15:36:47,522|zzb_logger : INFO 表格指标抽取 201917 完成,耗时156.64 秒。
|
||||
2024-11-25 15:36:47,523|zzb_logger : INFO 启动这个指标归一化任务ID-修改测试:201917
|
||||
2024-11-25 15:36:47,750|zzb_logger : INFO 目录黑名单为:[146, 170, 183, 190, 339, 403, 411, 417, 427, 449, 450, 549, 551, 553, 556, 575, 594, 604, 606, 607, 608, 611, 612, 613, 614, 615, 617, 618, 619, 620, 621, 626, 633, 641, 737, 738, 742, 747, 1138, 1141, 1144, 1147, 1150, 1154, 1156, 1160, 1163, 1166, 1169, 1173, 1257, 1587, 1588, 1697, 1723, 1727, 1770, 1801, 1814, 1874, 1934, 1943]
|
||||
2024-11-25 15:36:48,656|zzb_logger : INFO 向量配置数据查询 0.41 秒。
|
||||
2024-11-25 15:36:48,658|zzb_logger : INFO insert_table_measure_from_vector_async_process方法走的半年报
|
||||
2024-11-25 15:36:49,797|zzb_logger : INFO Run task 0-351 (6964)...
|
||||
2024-11-25 15:36:49,797|zzb_logger : INFO 插入数据 2815
|
||||
2024-11-25 15:36:50,291|zzb_logger : INFO 黑名单的值是[146, 170, 183, 190, 339, 403, 411, 417, 427, 449, 450, 549, 551, 553, 556, 575, 594, 604, 606, 607, 608, 611, 612, 613, 614, 615, 617, 618, 619, 620, 621, 626, 633, 641, 737, 738, 742, 747, 1138, 1141, 1144, 1147, 1150, 1154, 1156, 1160, 1163, 1166, 1169, 1173, 1257, 1587, 1588, 1697, 1723, 1727, 1770, 1801, 1814, 1874, 1934, 1943, 146, 405, 413, 419, 190, 430, 449, 1590, 1725, 594, 1947, 1815, 1817]和['145', '146', '263', '489', '69', '78', '497', '83', '498', '604', '606', '85', '174', '502', '87', '610', '90', '292', '175', '611', '508', '177', '293', '612', '178', '613', '405', '295', '296', '614', '615', '298', '514', '182', '617', '413', '859', '618', '619', '620', '521', '731', '862', '621', '419', '190', '308', '310', '870', '626', '741', '430', '1010', '877', '533', '1013', '880', '534', '1149', '883', '1014', '536', '758', '1152', '537', '886', '539', '1019', '1289', '1158', '892', '542', '544', '641', '213', '449', '1162', '333', '550', '1165', '220', '905', '552', '1168', '777', '222', '224', '339', '1171', '783', '1585', '226', '341', '1311', '1177', '230', '924', '1590', '929', '1319', '566', '567', '802', '1593', '570', '1328', '358', '1721', '805', '478', '1725', '1597', '808', '1465', '481', '672', '952', '815', '482', '1729', '484', '485', '958', '486', '1607', '1474', '1989', '821', '586', '1350', '824', '969', '1738', '828', '1873', '592', '1875', '594', '689', '1093', '982', '690', '838', '600', '2008', '843', '993', '2013', '996', '700', '703', '1248', '1768', '1770', '2035', '1259', '1772', '713', '1773', '2041', '1776', '1140', '1932', '1778', '1143', '1780', '1146', '1938', '2052', '1781', '1782', '1947', '1552', '1427', '1950', '2065', '1556', '1953', '1960', '1561', '1684', '2075', '1966', '1969', '1689', '1690', '1973', '1692', '1808', '1576', '1701', '2091', '1580', '1815', '1702', '1817', '1820', '1709', '1821', '1714', '1825', '1835', '1843']以及新增的[]
|
||||
2024-11-25 15:36:50,925|zzb_logger : INFO Run task 351-702 (17576)...
|
||||
2024-11-25 15:36:50,925|zzb_logger : INFO 插入数据 2815
|
||||
2024-11-25 15:36:51,324|zzb_logger : INFO 黑名单的值是[146, 170, 183, 190, 339, 403, 411, 417, 427, 449, 450, 549, 551, 553, 556, 575, 594, 604, 606, 607, 608, 611, 612, 613, 614, 615, 617, 618, 619, 620, 621, 626, 633, 641, 737, 738, 742, 747, 1138, 1141, 1144, 1147, 1150, 1154, 1156, 1160, 1163, 1166, 1169, 1173, 1257, 1587, 1588, 1697, 1723, 1727, 1770, 1801, 1814, 1874, 1934, 1943, 146, 405, 413, 419, 190, 430, 449, 1590, 1725, 594, 1947, 1815, 1817]和['145', '146', '263', '489', '69', '78', '497', '83', '498', '604', '606', '85', '174', '502', '87', '610', '90', '292', '175', '611', '508', '177', '293', '612', '178', '613', '405', '295', '296', '614', '615', '298', '514', '182', '617', '413', '859', '618', '619', '620', '521', '731', '862', '621', '419', '190', '308', '310', '870', '626', '741', '430', '1010', '877', '533', '1013', '880', '534', '1149', '883', '1014', '536', '758', '1152', '537', '886', '539', '1019', '1289', '1158', '892', '542', '544', '641', '213', '449', '1162', '333', '550', '1165', '220', '905', '552', '1168', '777', '222', '224', '339', '1171', '783', '1585', '226', '341', '1311', '1177', '230', '924', '1590', '929', '1319', '566', '567', '802', '1593', '570', '1328', '358', '1721', '805', '478', '1725', '1597', '808', '1465', '481', '672', '952', '815', '482', '1729', '484', '485', '958', '486', '1607', '1474', '1989', '821', '586', '1350', '824', '969', '1738', '828', '1873', '592', '1875', '594', '689', '1093', '982', '690', '838', '600', '2008', '843', '993', '2013', '996', '700', '703', '1248', '1768', '1770', '2035', '1259', '1772', '713', '1773', '2041', '1776', '1140', '1932', '1778', '1143', '1780', '1146', '1938', '2052', '1781', '1782', '1947', '1552', '1427', '1950', '2065', '1556', '1953', '1960', '1561', '1684', '2075', '1966', '1969', '1689', '1690', '1973', '1692', '1808', '1576', '1701', '2091', '1580', '1815', '1702', '1817', '1820', '1709', '1821', '1714', '1825', '1835', '1843']以及新增的[]
|
||||
2024-11-25 15:36:52,083|zzb_logger : INFO Run task 702-1053 (1308)...
|
||||
2024-11-25 15:36:52,083|zzb_logger : INFO 插入数据 2815
|
||||
2024-11-25 15:36:52,569|zzb_logger : INFO 黑名单的值是[146, 170, 183, 190, 339, 403, 411, 417, 427, 449, 450, 549, 551, 553, 556, 575, 594, 604, 606, 607, 608, 611, 612, 613, 614, 615, 617, 618, 619, 620, 621, 626, 633, 641, 737, 738, 742, 747, 1138, 1141, 1144, 1147, 1150, 1154, 1156, 1160, 1163, 1166, 1169, 1173, 1257, 1587, 1588, 1697, 1723, 1727, 1770, 1801, 1814, 1874, 1934, 1943, 146, 405, 413, 419, 190, 430, 449, 1590, 1725, 594, 1947, 1815, 1817]和['145', '146', '263', '489', '69', '78', '497', '83', '498', '604', '606', '85', '174', '502', '87', '610', '90', '292', '175', '611', '508', '177', '293', '612', '178', '613', '405', '295', '296', '614', '615', '298', '514', '182', '617', '413', '859', '618', '619', '620', '521', '731', '862', '621', '419', '190', '308', '310', '870', '626', '741', '430', '1010', '877', '533', '1013', '880', '534', '1149', '883', '1014', '536', '758', '1152', '537', '886', '539', '1019', '1289', '1158', '892', '542', '544', '641', '213', '449', '1162', '333', '550', '1165', '220', '905', '552', '1168', '777', '222', '224', '339', '1171', '783', '1585', '226', '341', '1311', '1177', '230', '924', '1590', '929', '1319', '566', '567', '802', '1593', '570', '1328', '358', '1721', '805', '478', '1725', '1597', '808', '1465', '481', '672', '952', '815', '482', '1729', '484', '485', '958', '486', '1607', '1474', '1989', '821', '586', '1350', '824', '969', '1738', '828', '1873', '592', '1875', '594', '689', '1093', '982', '690', '838', '600', '2008', '843', '993', '2013', '996', '700', '703', '1248', '1768', '1770', '2035', '1259', '1772', '713', '1773', '2041', '1776', '1140', '1932', '1778', '1143', '1780', '1146', '1938', '2052', '1781', '1782', '1947', '1552', '1427', '1950', '2065', '1556', '1953', '1960', '1561', '1684', '2075', '1966', '1969', '1689', '1690', '1973', '1692', '1808', '1576', '1701', '2091', '1580', '1815', '1702', '1817', '1820', '1709', '1821', '1714', '1825', '1835', '1843']以及新增的[]
|
||||
2024-11-25 15:36:53,251|zzb_logger : INFO Run task 1053-1404 (24420)...
|
||||
2024-11-25 15:36:53,251|zzb_logger : INFO 插入数据 2815
|
||||
2024-11-25 15:36:54,430|zzb_logger : INFO Run task 1404-1755 (27824)...
|
||||
2024-11-25 15:36:54,430|zzb_logger : INFO 插入数据 2815
|
||||
2024-11-25 15:36:55,150|zzb_logger : INFO 黑名单的值是[146, 170, 183, 190, 339, 403, 411, 417, 427, 449, 450, 549, 551, 553, 556, 575, 594, 604, 606, 607, 608, 611, 612, 613, 614, 615, 617, 618, 619, 620, 621, 626, 633, 641, 737, 738, 742, 747, 1138, 1141, 1144, 1147, 1150, 1154, 1156, 1160, 1163, 1166, 1169, 1173, 1257, 1587, 1588, 1697, 1723, 1727, 1770, 1801, 1814, 1874, 1934, 1943, 146, 405, 413, 419, 190, 430, 449, 1590, 1725, 594, 1947, 1815, 1817]和['145', '146', '263', '489', '69', '78', '497', '83', '498', '604', '606', '85', '174', '502', '87', '610', '90', '292', '175', '611', '508', '177', '293', '612', '178', '613', '405', '295', '296', '614', '615', '298', '514', '182', '617', '413', '859', '618', '619', '620', '521', '731', '862', '621', '419', '190', '308', '310', '870', '626', '741', '430', '1010', '877', '533', '1013', '880', '534', '1149', '883', '1014', '536', '758', '1152', '537', '886', '539', '1019', '1289', '1158', '892', '542', '544', '641', '213', '449', '1162', '333', '550', '1165', '220', '905', '552', '1168', '777', '222', '224', '339', '1171', '783', '1585', '226', '341', '1311', '1177', '230', '924', '1590', '929', '1319', '566', '567', '802', '1593', '570', '1328', '358', '1721', '805', '478', '1725', '1597', '808', '1465', '481', '672', '952', '815', '482', '1729', '484', '485', '958', '486', '1607', '1474', '1989', '821', '586', '1350', '824', '969', '1738', '828', '1873', '592', '1875', '594', '689', '1093', '982', '690', '838', '600', '2008', '843', '993', '2013', '996', '700', '703', '1248', '1768', '1770', '2035', '1259', '1772', '713', '1773', '2041', '1776', '1140', '1932', '1778', '1143', '1780', '1146', '1938', '2052', '1781', '1782', '1947', '1552', '1427', '1950', '2065', '1556', '1953', '1960', '1561', '1684', '2075', '1966', '1969', '1689', '1690', '1973', '1692', '1808', '1576', '1701', '2091', '1580', '1815', '1702', '1817', '1820', '1709', '1821', '1714', '1825', '1835', '1843']以及新增的[]
|
||||
2024-11-25 15:36:55,181|zzb_logger : INFO 黑名单的值是[146, 170, 183, 190, 339, 403, 411, 417, 427, 449, 450, 549, 551, 553, 556, 575, 594, 604, 606, 607, 608, 611, 612, 613, 614, 615, 617, 618, 619, 620, 621, 626, 633, 641, 737, 738, 742, 747, 1138, 1141, 1144, 1147, 1150, 1154, 1156, 1160, 1163, 1166, 1169, 1173, 1257, 1587, 1588, 1697, 1723, 1727, 1770, 1801, 1814, 1874, 1934, 1943, 146, 405, 413, 419, 190, 430, 449, 1590, 1725, 594, 1947, 1815, 1817]和['145', '146', '263', '489', '69', '78', '497', '83', '498', '604', '606', '85', '174', '502', '87', '610', '90', '292', '175', '611', '508', '177', '293', '612', '178', '613', '405', '295', '296', '614', '615', '298', '514', '182', '617', '413', '859', '618', '619', '620', '521', '731', '862', '621', '419', '190', '308', '310', '870', '626', '741', '430', '1010', '877', '533', '1013', '880', '534', '1149', '883', '1014', '536', '758', '1152', '537', '886', '539', '1019', '1289', '1158', '892', '542', '544', '641', '213', '449', '1162', '333', '550', '1165', '220', '905', '552', '1168', '777', '222', '224', '339', '1171', '783', '1585', '226', '341', '1311', '1177', '230', '924', '1590', '929', '1319', '566', '567', '802', '1593', '570', '1328', '358', '1721', '805', '478', '1725', '1597', '808', '1465', '481', '672', '952', '815', '482', '1729', '484', '485', '958', '486', '1607', '1474', '1989', '821', '586', '1350', '824', '969', '1738', '828', '1873', '592', '1875', '594', '689', '1093', '982', '690', '838', '600', '2008', '843', '993', '2013', '996', '700', '703', '1248', '1768', '1770', '2035', '1259', '1772', '713', '1773', '2041', '1776', '1140', '1932', '1778', '1143', '1780', '1146', '1938', '2052', '1781', '1782', '1947', '1552', '1427', '1950', '2065', '1556', '1953', '1960', '1561', '1684', '2075', '1966', '1969', '1689', '1690', '1973', '1692', '1808', '1576', '1701', '2091', '1580', '1815', '1702', '1817', '1820', '1709', '1821', '1714', '1825', '1835', '1843']以及新增的[]
|
||||
2024-11-25 15:36:55,608|zzb_logger : INFO Run task 1755-2106 (22624)...
|
||||
2024-11-25 15:36:55,608|zzb_logger : INFO 插入数据 2815
|
||||
2024-11-25 15:36:56,069|zzb_logger : INFO 黑名单的值是[146, 170, 183, 190, 339, 403, 411, 417, 427, 449, 450, 549, 551, 553, 556, 575, 594, 604, 606, 607, 608, 611, 612, 613, 614, 615, 617, 618, 619, 620, 621, 626, 633, 641, 737, 738, 742, 747, 1138, 1141, 1144, 1147, 1150, 1154, 1156, 1160, 1163, 1166, 1169, 1173, 1257, 1587, 1588, 1697, 1723, 1727, 1770, 1801, 1814, 1874, 1934, 1943, 146, 405, 413, 419, 190, 430, 449, 1590, 1725, 594, 1947, 1815, 1817]和['145', '146', '263', '489', '69', '78', '497', '83', '498', '604', '606', '85', '174', '502', '87', '610', '90', '292', '175', '611', '508', '177', '293', '612', '178', '613', '405', '295', '296', '614', '615', '298', '514', '182', '617', '413', '859', '618', '619', '620', '521', '731', '862', '621', '419', '190', '308', '310', '870', '626', '741', '430', '1010', '877', '533', '1013', '880', '534', '1149', '883', '1014', '536', '758', '1152', '537', '886', '539', '1019', '1289', '1158', '892', '542', '544', '641', '213', '449', '1162', '333', '550', '1165', '220', '905', '552', '1168', '777', '222', '224', '339', '1171', '783', '1585', '226', '341', '1311', '1177', '230', '924', '1590', '929', '1319', '566', '567', '802', '1593', '570', '1328', '358', '1721', '805', '478', '1725', '1597', '808', '1465', '481', '672', '952', '815', '482', '1729', '484', '485', '958', '486', '1607', '1474', '1989', '821', '586', '1350', '824', '969', '1738', '828', '1873', '592', '1875', '594', '689', '1093', '982', '690', '838', '600', '2008', '843', '993', '2013', '996', '700', '703', '1248', '1768', '1770', '2035', '1259', '1772', '713', '1773', '2041', '1776', '1140', '1932', '1778', '1143', '1780', '1146', '1938', '2052', '1781', '1782', '1947', '1552', '1427', '1950', '2065', '1556', '1953', '1960', '1561', '1684', '2075', '1966', '1969', '1689', '1690', '1973', '1692', '1808', '1576', '1701', '2091', '1580', '1815', '1702', '1817', '1820', '1709', '1821', '1714', '1825', '1835', '1843']以及新增的[]
|
||||
2024-11-25 15:36:56,789|zzb_logger : INFO Run task 2106-2457 (23664)...
|
||||
2024-11-25 15:36:56,789|zzb_logger : INFO 插入数据 2815
|
||||
2024-11-25 15:36:57,633|zzb_logger : INFO 黑名单的值是[146, 170, 183, 190, 339, 403, 411, 417, 427, 449, 450, 549, 551, 553, 556, 575, 594, 604, 606, 607, 608, 611, 612, 613, 614, 615, 617, 618, 619, 620, 621, 626, 633, 641, 737, 738, 742, 747, 1138, 1141, 1144, 1147, 1150, 1154, 1156, 1160, 1163, 1166, 1169, 1173, 1257, 1587, 1588, 1697, 1723, 1727, 1770, 1801, 1814, 1874, 1934, 1943, 146, 405, 413, 419, 190, 430, 449, 1590, 1725, 594, 1947, 1815, 1817]和['145', '146', '263', '489', '69', '78', '497', '83', '498', '604', '606', '85', '174', '502', '87', '610', '90', '292', '175', '611', '508', '177', '293', '612', '178', '613', '405', '295', '296', '614', '615', '298', '514', '182', '617', '413', '859', '618', '619', '620', '521', '731', '862', '621', '419', '190', '308', '310', '870', '626', '741', '430', '1010', '877', '533', '1013', '880', '534', '1149', '883', '1014', '536', '758', '1152', '537', '886', '539', '1019', '1289', '1158', '892', '542', '544', '641', '213', '449', '1162', '333', '550', '1165', '220', '905', '552', '1168', '777', '222', '224', '339', '1171', '783', '1585', '226', '341', '1311', '1177', '230', '924', '1590', '929', '1319', '566', '567', '802', '1593', '570', '1328', '358', '1721', '805', '478', '1725', '1597', '808', '1465', '481', '672', '952', '815', '482', '1729', '484', '485', '958', '486', '1607', '1474', '1989', '821', '586', '1350', '824', '969', '1738', '828', '1873', '592', '1875', '594', '689', '1093', '982', '690', '838', '600', '2008', '843', '993', '2013', '996', '700', '703', '1248', '1768', '1770', '2035', '1259', '1772', '713', '1773', '2041', '1776', '1140', '1932', '1778', '1143', '1780', '1146', '1938', '2052', '1781', '1782', '1947', '1552', '1427', '1950', '2065', '1556', '1953', '1960', '1561', '1684', '2075', '1966', '1969', '1689', '1690', '1973', '1692', '1808', '1576', '1701', '2091', '1580', '1815', '1702', '1817', '1820', '1709', '1821', '1714', '1825', '1835', '1843']以及新增的[]
|
||||
2024-11-25 15:36:58,127|zzb_logger : INFO 等待所有子任务完成,任务ID:201917
|
||||
2024-11-25 15:36:58,127|zzb_logger : INFO Run task 2457-2815 (10160)...
|
||||
2024-11-25 15:36:58,127|zzb_logger : INFO 插入数据 2815
|
||||
2024-11-25 15:36:58,816|zzb_logger : INFO 黑名单的值是[146, 170, 183, 190, 339, 403, 411, 417, 427, 449, 450, 549, 551, 553, 556, 575, 594, 604, 606, 607, 608, 611, 612, 613, 614, 615, 617, 618, 619, 620, 621, 626, 633, 641, 737, 738, 742, 747, 1138, 1141, 1144, 1147, 1150, 1154, 1156, 1160, 1163, 1166, 1169, 1173, 1257, 1587, 1588, 1697, 1723, 1727, 1770, 1801, 1814, 1874, 1934, 1943, 146, 405, 413, 419, 190, 430, 449, 1590, 1725, 594, 1947, 1815, 1817]和['145', '146', '263', '489', '69', '78', '497', '83', '498', '604', '606', '85', '174', '502', '87', '610', '90', '292', '175', '611', '508', '177', '293', '612', '178', '613', '405', '295', '296', '614', '615', '298', '514', '182', '617', '413', '859', '618', '619', '620', '521', '731', '862', '621', '419', '190', '308', '310', '870', '626', '741', '430', '1010', '877', '533', '1013', '880', '534', '1149', '883', '1014', '536', '758', '1152', '537', '886', '539', '1019', '1289', '1158', '892', '542', '544', '641', '213', '449', '1162', '333', '550', '1165', '220', '905', '552', '1168', '777', '222', '224', '339', '1171', '783', '1585', '226', '341', '1311', '1177', '230', '924', '1590', '929', '1319', '566', '567', '802', '1593', '570', '1328', '358', '1721', '805', '478', '1725', '1597', '808', '1465', '481', '672', '952', '815', '482', '1729', '484', '485', '958', '486', '1607', '1474', '1989', '821', '586', '1350', '824', '969', '1738', '828', '1873', '592', '1875', '594', '689', '1093', '982', '690', '838', '600', '2008', '843', '993', '2013', '996', '700', '703', '1248', '1768', '1770', '2035', '1259', '1772', '713', '1773', '2041', '1776', '1140', '1932', '1778', '1143', '1780', '1146', '1938', '2052', '1781', '1782', '1947', '1552', '1427', '1950', '2065', '1556', '1953', '1960', '1561', '1684', '2075', '1966', '1969', '1689', '1690', '1973', '1692', '1808', '1576', '1701', '2091', '1580', '1815', '1702', '1817', '1820', '1709', '1821', '1714', '1825', '1835', '1843']以及新增的[]
|
||||
2024-11-25 15:39:18,387|zzb_logger : INFO 所有子任务完成,任务ID:201917
|
||||
2024-11-25 15:39:18,387|zzb_logger : INFO 启动指标归一化任务ID:201917
|
||||
2024-11-25 15:39:18,387|zzb_logger : INFO 向量更新时间 149.73 秒。
|
||||
2024-11-25 15:39:18,548|zzb_logger : INFO 更新数据查询 0.14 秒。
|
||||
2024-11-25 15:39:18,548|zzb_logger : INFO update_ori_measure方法走的是半年报
|
||||
2024-11-25 15:39:18,548|zzb_logger : INFO 更新数据更新 0.00 秒。
|
||||
2024-11-25 15:39:18,625|zzb_logger : INFO 更新数据写入 0.08 秒。
|
||||
2024-11-25 15:39:18,625|zzb_logger : INFO 归一化完成任务ID:201917
|
||||
2024-11-25 15:39:18,625|zzb_logger : INFO 任务 201917 完成,耗时356.04 秒。
|
||||
2024-11-25 15:39:18,811|zzb_logger : INFO 通知任务状态url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=1
|
||||
2024-11-25 15:39:18,811|zzb_logger : INFO 通知任务状态任务:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
||||
"http://www.w3.org/TR/html4/strict.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
|
||||
<title>Error response</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Error response</h1>
|
||||
<p>Error code: 404</p>
|
||||
<p>Message: File not found.</p>
|
||||
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
2024-11-25 15:39:18,968|zzb_logger : INFO 任务 201917 完成
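Every notify call in both runs receives this 404 page because nothing is serving /api/tenant/report/notify on 127.0.0.1:8100 in this environment; the pipeline just logs the body and carries on. The notify step itself is presumably a plain HTTP GET; a hedged sketch of such a client follows (the function name notify_status and the use of requests are assumptions, since the real caller sits in one of the suppressed diffs, if anywhere):

import requests

def notify_status(file_id, status, base='http://127.0.0.1:8100'):
    """Report task progress; the logs use status 5 (parsing), 6 (extracting) and 1 (done)."""
    url = f'{base}/api/tenant/report/notify?fileId={file_id}&status={status}'
    try:
        resp = requests.get(url, timeout=5)
        return resp.status_code, resp.text
    except requests.RequestException as e:
        return None, str(e)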
File diff suppressed because it is too large
@@ -0,0 +1,151 @@
import camelot
import re
import os, time
import mysql.connector

# pdfminer: analyze the PDF layout and pull out horizontal text boxes
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal

import db_service
import utils
from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB


def text_in_table(top, tables_range, page_num):
    # True if a text line's top coordinate falls inside any table bbox on this page.
    if tables_range.get(page_num):
        for rng in tables_range[page_num]:
            if rng['bottom'] < top < rng['top']:
                return True
    return False


def get_text_type(text: str):
    text = re.sub(r"\s", "", text)
    first_re = '年度报告'
    page_number_pattern = re.compile(r'^\d+(/\d+)?$')

    if re.search(first_re, text.strip()):
        return 'page_header'

    if page_number_pattern.match(text.strip()):
        return 'page_footer'

    if len(text) < 20 and text.endswith('页'):
        return 'page_footer'

    return 'text'


# Read the plain-text content of the PDF, excluding tables.
def get_text_content(pdf_path, file_id, tables_range, conn, cursor):
    """
    Persist the PDF's text lines (tables excluded) for one file.
    """
    # Extract pages from the PDF; pass page_numbers=[4, 5, 6] to limit the range.
    for pagenum, page in enumerate(extract_pages(pdf_path)):
        try:
            # Collect all layout elements on the page with their y positions.
            page_elements = [(element.y1, element) for element in page._objs]
            # Walk the elements that make up the page.
            for i, component in enumerate(page_elements):
                try:
                    # Pull the layout element out of the (y1, element) pair.
                    element = component[1]
                    # Only text elements are of interest here.
                    if isinstance(element, LTTextBoxHorizontal):

                        # element_top = element.bbox[3]
                        print(element)
                        line_text = element.get_text().replace('\n', '')
                        line_text = re.sub(r"\s", "", line_text)
                        if delete_flag(line_text):
                            continue

                        # if not text_in_table(element_top, tables_range, pagenum+1):
                        db_service.insert_pdf_text_info({
                            'file_id': file_id,
                            'page_num': pagenum + 1,
                            'text': line_text
                        }, conn, cursor)
                except Exception as e:
                    print(f'{pagenum}页{i}处理异常')
                    print(e)

        except Exception as e:
            print(f'{pagenum}页处理异常')
            print(e)


def delete_flag(text: str):
    # Drop lines that are mostly symbols or digits.
    if utils.under_non_alpha_ratio(text):
        return True

    # Drop lines with no common Chinese punctuation: rarely real sentences.
    if not re.findall(',|,|。|、|(|)', text):
        return True

    # Drop "适用/不适用" (applicable / not applicable) checkbox boilerplate.
    if text.find('适用') != -1 and text.find('不适用') != -1:
        return True

    # Drop "是/否" (yes / no) checkbox boilerplate.
    if text.find('是') != -1 and text.find('否') != -1:
        return True

    return False
|
||||
def get_table_range(file_path, file_id, pages, tables_range):
|
||||
|
||||
print('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
|
||||
start = time.time()
|
||||
|
||||
conn = mysql.connector.connect(
|
||||
host= MYSQL_HOST,
|
||||
user= MYSQL_USER,
|
||||
password= MYSQL_PASSWORD,
|
||||
database= MYSQL_DB
|
||||
)
|
||||
|
||||
# 创建一个cursor对象来执行SQL语句
|
||||
cursor = conn.cursor(buffered=True)
|
||||
|
||||
tables = camelot.read_pdf(file_path, pages=pages, strip_text=',\n', copy_text=['v','h'],shift_text = ['l'])
|
||||
for t in tables:
|
||||
|
||||
top = t._bbox[3]
|
||||
buttom = t._bbox[1]
|
||||
page_num = int(t.page)
|
||||
table_index = int(t.order)
|
||||
|
||||
if not tables_range.get(page_num):
|
||||
tables_range[page_num] = []
|
||||
|
||||
tables_range[page_num].append({
|
||||
'top' : top,
|
||||
'buttom' : buttom,
|
||||
'table_index' : table_index,
|
||||
'page_num' : page_num,
|
||||
})
|
||||
|
||||
get_text_content(file_path, file_id, tables_range, conn, cursor)
|
||||
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
end = time.time()
|
||||
print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
|
||||
|
||||
if __name__ == "__main__":
|
||||
path = "/Users/zhengfei/Desktop/cb/002315-2023-nb-nb.pdf"
|
||||
# get_text_content(path,'111')
|
||||
# get_table_measure(path,'all','111')
|
||||
#print(pdf_data)
|
||||
# pdf_info = []
|
||||
tables_range = {}
|
||||
get_table_range(path, '5555', 'all', tables_range)
|
||||
|
||||
# sorted_pdf_info = sorted(pdf_info, key=lambda k: k['sort_num'])
|
||||
|
||||
# pdf_tables = merge_consecutive_arrays(sorted_pdf_info)
|
||||
# for table in pdf_tables:
|
||||
# print(table)#修改测试
|
|
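A usage sketch (with made-up coordinates) of the bbox filter defined above: camelot reports each table's bounding box with y measured from the bottom of the page, and text_in_table treats a pdfminer text line as table content when its top edge falls strictly between a table's bottom and top edges.

# Hypothetical tables_range entry for a single table on page 3,
# spanning y = 220 .. 540; the coordinates are made up for illustration.
tables_range = {
    3: [
        {'top': 540.0, 'buttom': 220.0, 'table_index': 1, 'page_num': 3},
    ],
}

assert text_in_table(400.0, tables_range, 3) is True    # inside the table box
assert text_in_table(600.0, tables_range, 3) is False   # above the table
assert text_in_table(400.0, tables_range, 4) is False   # page with no tables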
@@ -0,0 +1,785 @@
import re
import os, time
from config import MILVUS_CLIENT, MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB, MEASURE_COUNT, MYSQL_HOST_APP, MYSQL_USER_APP, MYSQL_PASSWORD_APP, MYSQL_DB_APP
import mysql.connector
import utils
from pymilvus import MilvusClient

import numpy as np
from multiprocessing import Process
from config import REDIS_HOST, REDIS_PORT, REDIS_PASSWORD
import redis
import db_service_word
from zzb_logger import applog


'''
Known issues:
1. Table/text extraction errors: when text and a table share a page with the text first, the text cannot be extracted.
2. LLM extraction errors: for 2023 operating revenue, the main-business revenue, per-product revenue and change ratio get extracted incorrectly.
3. Indicators that live in tables get extracted as body text.
4. The LLM groups semantically unrelated indicators together; consider using vector similarity to tell them apart.
'''

# Processing pipeline
# 1. get_table_range fetches all tables plus their context with multiple processes, producing one complete list
# 2. a single process merges tables split across pages, producing a new array of table objects
# 3. the new table array feeds the original indicator-parsing flow, again with multiple processes


STR_PATTERN = '营业收入|净利润|变动比例|损益|现金流量净额|现金净流量|现金流|每股收益|总资产|资产总额|收益率|货币资金|应收账款|存货|固定资产|在建工程|商誉|短期借款|应付账款|合同负债|长期借款|营业成本|销售费用|管理费用|财务费用|研发费用|研发投入'
PATTERN = '品牌类型|分门店|销售渠道|行业名称|产品名称|地区名称|子公司名称|业绩快报|调整情况说明|调整年初资产负债表|计入当期损益的政府补助|主要子公司|分部|母公司资产负债表|显示服务|渠道|商品类型|合同分类|会计政策变更|地区分类|研发项目|分类产品|表头不合规的表格|内部控制评价|关联方|国内地区|国外地区|销售区域|存货库龄|外币|逾期60天以上|欧元|英镑|美元|日元'
MUILT_PATTERN = '调整前'
# unit_pattern = re.compile(r'单位[:|:]?(百万元|千万元|亿元|万元|千元|元)')
unit_pattern = re.compile(r'(单位|单元|人民币).{0,6}?(百万元|千万元|亿元|万元|千元|元).{0,3}?')  # relaxed unit rule: no colon required, only the distance is limited


# Get the header info for an indicator
def get_col_num_info(array, row_num, col_num, x, y):
    num_info = ""
    for j in range(col_num):
        if len(str(array[x][j])) > 50:
            continue
        num_info += str(array[x][j])

    return num_info.replace('%', '')


# Get the header info for an indicator
def get_row_num_info(array, row_num, col_num, x, y):
    num_info = ""

    for i in range(row_num):
        if len(str(array[i][y])) > 50:
            continue
        num_info += str(array[i][y])

    return num_info


def table_converter(table):
    table_string = ''
    # Walk every row of the table
    for row_num in range(len(table)):
        row = table[row_num]
        # Strip line breaks from wrapped cell text
        cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
        # Join the table into a string, watching '|' and '\n'
        table_string += (','.join(cleaned_row))
    # Drop the last newline
    table_string = table_string[:-1]
    return table_string


# Check whether the second column holds Chinese characters
def is_chinese(s):
    return bool(re.search('[\u4e00-\u9fff]', s))


def check_table(arr):
    split_index = None
    for i in range(arr.shape[0]):
        # Skip the first row
        if arr[i, 0] == "" and is_chinese(arr[i, 1]) and i > 1:
            split_index = i
            break
    if split_index is not None:
        arr1 = arr[:split_index]
        arr2 = arr[split_index:]
        return [arr1, arr2]
    else:
        return [arr]


def safe_process_array(func, arr):
    try:
        return func(arr)
    except Exception as e:
        print(f"这个函数出现了报错{func.__name__}: {e}")
        return arr  # return the original array so processing can continue


# Fix merged rows seen in Q3 balance-sheet tables
def process_array(arr, years=['2022', '2023', '2024'], keyword='项目'):
    # Make sure the row has enough columns for the split values
    def ensure_columns(row, num_columns):
        while len(row) < num_columns:
            row.append('')

    def is_valid_header(header, years, keyword):
        header_text = header.lower()  # lower-case for more robust matching
        return any(year in header_text for year in years) and keyword in header_text

    # Clean the string
    def clean_text(text):
        # Drop spaces around "年" and "月"
        text = re.sub(r'\s*(年|月)\s*', r'\1', text)
        # Drop the space to the left of "日"
        text = re.sub(r'\s*日', '日', text)
        return text

    # Convert a numpy array to a list
    arr = arr.tolist() if isinstance(arr, np.ndarray) else arr

    if len(arr[0]) == 1 and is_valid_header(arr[0][0], years, keyword):
        remaining_value = arr[0][0]

        # Clean the string
        remaining_value = clean_text(remaining_value)

        parts = remaining_value.split()

        ensure_columns(arr[0], len(parts))
        for i in range(len(parts)):
            arr[0][i] = parts[i]

    header_columns = len(arr[0])

    for i in range(1, len(arr)):
        if len(arr[i]) == 1:
            remaining_value = arr[i][0]
            parts = remaining_value.split()
            if len(parts) > header_columns:
                parts = parts[:header_columns]
            ensure_columns(arr[i], header_columns)
            for j in range(len(parts)):
                arr[i][j] = parts[j]
            # Pad with empty strings when too few values were split out
            if len(parts) < header_columns:
                for j in range(len(parts), header_columns):
                    arr[i][j] = ''

    return arr


# Q3-specific tweak: distinguish the two 上年同期 columns under 本报告期 and 年初至报告期末
def process_array_with_annual_comparison(arr, keywords=['本报告期', '年初至报告期末', '上年同期']):
    def contains_all_keywords(header, keywords):
        return all(keyword in header for keyword in keywords)

    def split_and_replace_occurrences(header, target, replacement):
        # Find every position where target occurs
        indices = [i for i, x in enumerate(header) if x == target]
        if len(indices) > 1:
            split_index = len(indices) // 2
            for i in range(split_index):
                header[indices[i]] = replacement
        return header

    # Convert a numpy array to a list
    arr = arr.tolist() if isinstance(arr, np.ndarray) else arr

    if len(arr) > 0 and len(arr[0]) > 0:
        first_row = arr[0]

        if contains_all_keywords(first_row, keywords):
            # Split and replace 上年同期
            first_row = split_and_replace_occurrences(first_row, '上年同期', '三季报中无需识别的上年同期')
            arr[0] = first_row

    return arr


# Dedicated handling for non-recurring items in Q3 reports
def process_array_with_grants(arr, keywords=['本报告期', '年初至报告期'], target='计入当期损益的政府补助',
                              replacement='非经常性损益'):
    # Check whether the first row contains all keywords
    def contains_all_keywords(header, keywords):
        # return all(keyword in header for keyword in keywords)
        return all(any(keyword in str(cell) for cell in header) for keyword in keywords)

    # Check whether the target text appears in the first column
    def contains_target_in_first_column(arr, target):
        return any(target in str(item[0]) for item in arr)

    # Replace a specific value in the first column
    def replace_in_first_column(arr, target, replacement):
        for i in range(len(arr)):
            if arr[i][0] == target:
                arr[i][0] = replacement
        return arr

    # Convert a numpy array to a list
    arr = arr.tolist() if isinstance(arr, np.ndarray) else arr

    if len(arr) > 0 and len(arr[0]) > 0:
        first_row = arr[0]

        # Check the first-row and first-column conditions
        if contains_all_keywords(first_row, keywords) and contains_target_in_first_column(arr, target):
            # Replace 合计 in the first column
            arr = replace_in_first_column(arr, '合计', replacement)

    return arr


# Process table data
def process_table(file_id, tables):
    applog.info('Run task %s (%s)...' % (f'处理word文件中的table file_id:{file_id}', os.getpid()))
    start = time.time()

    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB
    )
    # Create a cursor object to run SQL statements
    cursor = conn.cursor(buffered=True)

    for t in tables:
        try:
            arr = np.array(t["data"])

            arr = safe_process_array(process_array, arr)  # merged rows in some balance sheets
            arr = safe_process_array(process_array_with_annual_comparison, arr)  # complex tables with several 上年同期 columns
            arr = safe_process_array(process_array_with_grants, arr)  # Q3 non-recurring items

            arr = np.char.replace(arr, ' ', '')
            arr = np.char.replace(arr, '\n', '')
            arr = np.char.replace(arr, ',', '')

            arr_list = check_table(arr)

            for a in arr_list:
                new_data = a.tolist()  # kept for saving to the database later
                new_data = utils.check_black_table_list(new_data)
                rows, cols = a.shape
                if rows == 1 and cols == 1:
                    continue
                arr_str = ''.join([''.join(map(str, row)) for row in a])
                # Store the full data in word_parse_data first
                db_service_word.insert_word_parse_process({
                    'file_id': file_id,
                    'page_num': t["index"],
                    'page_count': 100,
                    'type': 'table',
                    'content': {
                        'page_num': t["index"],
                        'table_index': t["index"],
                        "type": "table",
                        "data": new_data,
                    }}, conn, cursor, "word_parse_data")

                # Filter out tables that hold none of the indicators we extract
                matches = re.findall(STR_PATTERN, arr_str)
                pattern = re.findall(PATTERN, arr_str)
                muilt_pattern = re.findall(MUILT_PATTERN, arr_str)

                if len(matches) > 0 and len(muilt_pattern) < 5:
                    # if len(matches) > 0 and len(pattern) == 0 and len(muilt_pattern) < 5:
                    db_service_word.insert_word_parse_process({
                        'file_id': file_id,
                        'page_num': t["index"],
                        'page_count': 100,
                        'type': 'parse_table',
                        'content': {
                            'page_num': t["index"],
                            'table_index': t["index"],
                            "type": "table",
                            "data": new_data,
                        }}, conn, cursor, "word_parse_process")
        except Exception as e:
            applog.info(f'解析表格时出现了异常 {e} 内容为{t}')
    cursor.close()
    conn.close()
    end = time.time()
    applog.info('Task %s runs %0.2f seconds.' % (f'解析表格{file_id}', (end - start)))


def text_in_table(top, tables_range, page_num):
    if tables_range.get(page_num):
        for range in tables_range[page_num]:
            if top < range['top'] and top > range['buttom']:
                return True
    return False


def get_text_type(text: str):
    text = re.sub(r"\s", "", text)
    first_re = '年度报告'
    page_number_pattern = re.compile(r'^\d+(/\d+)?$')

    if re.search(first_re, text.strip()):
        return 'page_header'

    if page_number_pattern.match(text.strip()):
        return 'page_footer'

    if len(text) < 20 and text.endswith('页'):
        return 'page_footer'

    return 'text'


def check_report_type(file_id):
    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB
    )
    # Create a cursor object to run SQL statements
    cursor = conn.cursor(buffered=True)
    """
    :return: the report_type and year recorded for this file
    """
    select_year_select = f"""select report_type,year from report_check where id = {file_id}"""
    cursor.execute(select_year_select)
    record_select = cursor.fetchall()
    if record_select:
        report_type = record_select[0][0]
        report_year = record_select[0][1]
        cursor.close()
        conn.close()
        return int(report_type), report_year
    else:
        return None


# From a text index, find the nearest following table index, validating the length and count of the text in between
def get_next_table_index(text_index, texts, tables):
    try:
        for table in tables:
            if table["index"] > text_index and table["type"] == "table":
                table_index = table["index"]
                total_len = sum(len(texts.get(key).get("data").replace(" ", "")) for key in range(text_index + 1, table_index))
                # The nearest table must be within 10 indices
                if (table_index - text_index) < 10 and total_len < 50:
                    # and the text in between must total fewer than 50 characters
                    return table_index
                else:
                    return text_index
    except StopIteration:
        applog.error("Target not found")
        return text_index


# Process text data
def process_text_content(file_id, texts, tables, full_texts, type=0):
    applog.info('Run task %s (%s)...' % (f'处理word文件中的 text file_id:{file_id}', os.getpid()))
    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB
    )
    # Create a cursor object to run SQL statements
    cursor = conn.cursor(buffered=True)
    """
    :return: the text content of the word file, excluding tables
    """
    report_type, report_year = check_report_type(file_id)
    texts_dict = {t["index"]: t for t in full_texts}

    query = "SELECT title_list,button_list FROM table_title_list WHERE report_year = %s"
    cursor_dict = conn.cursor(dictionary=True)
    cursor_dict.execute(query, (report_year,))
    result = cursor_dict.fetchone()
    title_list = result['title_list']
    button_list = result['button_list']

    try:
        for t in texts:
            line_text = t["data"]
            line_text = re.sub(r"\s", "", line_text)
            line_text = re.sub(r":", ":", line_text)
            index = t["index"]

            if len(re.findall('母公司|现金流量表补充', line_text)) > 0:
                db_service_word.insert_measure_parser_info({
                    'file_id': file_id,
                    'content': get_next_table_index(index, texts_dict, tables),
                    'type': 'parent_com',
                }, conn, cursor)

            # Keep the text just above each table: it holds the table title and the indicator unit
            table_info = {}
            if (utils.check_table_title_black_list(line_text, title_list)
                    or utils.check_table_title_black_list_button(line_text, button_list)):
                db_service_word.insert_measure_parser_info({
                    'file_id': file_id,
                    'content': get_next_table_index(index, texts_dict, tables),
                    'type': 'table_index',
                }, conn, cursor)
            if utils.check_table_title_black_list_measure(line_text):
                db_service_word.insert_measure_parser_info_measure({
                    'file_id': file_id,
                    'content': get_next_table_index(index, texts_dict, tables),
                    'type': 'measure_index',
                }, conn, cursor, line_text)


            if re.findall(unit_pattern, line_text):
                # this line carries the unit
                table_info = get_table_unit_info(file_id, line_text, t["index"], t["index"] + 1)

                db_service_word.insert_table_unit_info_v1(table_info, conn, cursor)

            if utils.check_table_title_black_list_measure(line_text):
                db_service_word.insert_measure_parser_info_measure({
                    'file_id': file_id,
                    'content': f"{t['index']}_1",
                    'type': 'measure_index',
                }, conn, cursor, line_text)

            if not utils.pdf_text_flag(line_text):
                if utils.check_line_text(line_text):
                    db_service_word.insert_word_parse_process({
                        'file_id': file_id,
                        'page_num': t["index"],
                        'page_count': 100,
                        'type': 'parse_table',
                        'content': {
                            'page_num': t["index"],
                            'table_index': t["index"],
                            "type": "text",
                            'content': line_text,
                        }}, conn, cursor, "word_parse_process")

                # kept for the restricted-word check
                db_service_word.insert_word_parse_process({
                    'file_id': file_id,
                    'page_num': t["index"],
                    'page_count': 100,
                    'type': 'text',
                    'content': {
                        'page_num': t["index"],
                        'table_index': t["index"],
                        "type": "text",
                        'content': line_text,
                    }}, conn, cursor, "word_parse_data")

                table_name = "word_text_info"
                if type == 1:
                    table_name = "id_text_info"
                # Write to the database, passing the table name
                db_service_word.batch_insert_page_text({
                    'file_id': file_id,
                    'page_num': t["index"],
                    'text': line_text
                }, conn, cursor, table_name)
    except Exception as e:
        applog.error(f'文本处理异常{e}')




def get_table_unit_info(file_id, line_text, page_num, table_index):
    table_info = {}
    table_info['file_id'] = file_id
    match = unit_pattern.search(line_text)
    if match:
        unit = match.group(2)
        table_info['unit'] = unit

    table_info['page_num'] = page_num
    table_info['table_index'] = table_index

    return table_info


def get_table_text_info(file_id, line_text, page_num, table_index):
    table_info = {}
    table_info['file_id'] = file_id
    table_info['text_info'] = line_text
    table_info['page_num'] = page_num
    table_info['table_index'] = table_index

    return table_info


# Read the tables and merge each indicator with its headers, e.g. "2022年1季度营业收入为xxxxx"
def get_table_measure(file_id, word_tables, record_range):
    """
    :return: tables with each indicator merged with its headers, e.g. "2022年1季度营业收入为xxxxx"
    """
    try:
        redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
        conn = mysql.connector.connect(
            host=MYSQL_HOST,
            user=MYSQL_USER,
            password=MYSQL_PASSWORD,
            database=MYSQL_DB
        )

        # Create a cursor object to run SQL statements
        cursor = conn.cursor(buffered=True)
        conn_app = mysql.connector.connect(
            host=MYSQL_HOST_APP,
            user=MYSQL_USER_APP,
            password=MYSQL_PASSWORD_APP,
            database=MYSQL_DB_APP
        )

        # Create a cursor object to run SQL statements
        cursor_app = conn_app.cursor(buffered=True)

        select_year_select = f"""select report_type,year from report_check where id = {file_id}"""
        cursor.execute(select_year_select)
        record_select = cursor.fetchall()
        report_type = record_select[0][0]
        report_year = record_select[0][1]

        client = MilvusClient(
            uri=MILVUS_CLIENT
        )
        applog.info('提取指标任务 %s (%s)...' % (record_range, os.getpid()))
        start = time.time()

        record_start = record_range.split('-')[0]
        record_end = record_range.split('-')[1]
        for index in range(int(record_start), int(record_end)):
            t = word_tables[index]
            measure_obj = []
            data_dict = {}
            measure_list = []
            try:
                arr = np.array(t['data'])
                rows, cols = arr.shape
                if rows == 1 and cols == 1:
                    continue

                row_num, col_num = -1, -1

                # Nested scan of the array to find the first numeric cell
                for i in range(rows):
                    for j in range(cols):
                        if j == 0 or i == 0:  # keep the first row/column from being read as numbers
                            continue
                        measure_value_config = str(arr[i, j]).replace('(', '').replace(')', '')


                        if re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', measure_value_config):
                            if j == cols - 1:
                                row_num, col_num = i, j
                                break
                        elif (re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', measure_value_config)
                              or measure_value_config == '-'):
                            row_num, col_num = i, j
                            break
                    else:
                        continue
                    break
                # Walk the numeric sub-array and turn each cell into a semantically labelled indicator
                if row_num != -1 and col_num != -1:
                    for i in range(row_num, arr.shape[0]):
                        for j in range(col_num, arr.shape[1]):
                            measure_value = str(arr[i, j]).replace('%', '').replace('(', '-').replace(')', '')
                            if measure_value == '-' or measure_value == '' or len(measure_value) > 20:
                                continue
                            else:
                                row_num_info = get_row_num_info(arr, row_num, col_num, i, j)
                                col_num_info = get_col_num_info(arr, row_num, col_num, i, j)

                                # An empty upper header means the table was truncated; filter these out (研发投入 gets special handling elsewhere)
                                if row_num_info in ('', '-', ')', ')'):
                                    continue

                                # Special case: when 非经常性损益合计 and 非经常性损益净额 both appear, keep only the 净额 row
                                if col_num_info == '非经常性损益合计':
                                    continue

                                if utils.check_pdf_measure_black_list(f"{col_num_info}{row_num_info}"):
                                    continue

                                # Drop indicators that carry no reporting period
                                if utils.check_pdf_measure(f"{col_num_info}{row_num_info}"):
                                    continue

                                # Drop the cell when the upper and left headers disagree on the reporting period
                                row_period = utils.get_period_type_other(row_num_info, report_year)
                                col_period = utils.get_period_type_other(col_num_info, report_year)
                                if (row_period != col_period and row_period != 'c_n' and col_period != 'c_n'):
                                    continue
                                units_mapping = {
                                    "百万元": "百万元",
                                    "千万元": "千万元",
                                    "亿元": "亿元",
                                    "万元": "万元",
                                    "千元": "千元",
                                    "元": "元",
                                    "元/股": "元"
                                }
                                row_num_info = row_num_info.replace('%', '增减')
                                # num_info = f"{col_num_info}{row_num_info}".replace('()','').replace('加:','').replace('减:','').replace('%','')
                                num_info = utils.get_clean_text(f"{row_num_info}{col_num_info}")
                                num_info_bak = utils.get_clean_text(f"{col_num_info}{row_num_info}")
                                measure_unit = ''
                                # "%": "同期增减"
                                combined_info = f"{row_num_info} {col_num_info}"
                                # for unit in units_mapping:
                                #     if unit in row_num_info:
                                #         measure_unit = units_mapping[unit]
                                #         break
                                if utils.get_percent_flag(row_num_info) == '1':
                                    measure_unit = ''
                                else:
                                    for unit in units_mapping:
                                        if re.search(rf'\(\s*{unit}(\s*人民币)?\s*\)|\(\s*{unit}(\s*人民币)?\s*\)', combined_info) or (re.search(rf'{unit}', combined_info) and any(re.search('单位', item) for item in arr[0])):
                                            measure_unit = units_mapping[unit]
                                            break
                                measure_list.append({
                                    'measure_name': num_info,
                                    'measure_value': measure_value,
                                    'measure_unit': measure_unit,
                                })
                                measure_list.append({
                                    'measure_name': num_info_bak,
                                    'measure_value': measure_value,
                                    'measure_unit': measure_unit,
                                })

                if not redis_client.exists(f'parsed_measure_count_{file_id}'):
                    redis_client.set(f'parsed_measure_count_{file_id}', 0)

                redis_client.incr(f'parsed_measure_count_{file_id}')

                if len(measure_list) > 0:
                    data_dict["measure_list"] = measure_list
                    data_dict["page_num"] = f"{str(t['page_num'])}_{str(t['table_index'])}"
                    data_dict['file_id'] = file_id
                    measure_obj.append(data_dict)
                    db_service_word.insert_measure_data_to_milvus(client, measure_obj, cursor_app, conn_app)
            except Exception as e:
                applog.error(f"循环获取表格数据这里报错了,数据是{t['data']},位置在{index}")
                applog.error(f"错误是:{e}")
        end = time.time()
        applog.info('提取指标 %s runs %0.2f seconds.' % (record_range, (end - start)))
    except Exception as e:
        applog.error(f'这个错误是{e},所在的位置是{record_start}-{record_end}')
        record_start = record_range.split('-')[0]
        record_end = record_range.split('-')[1]
        for index in range(int(record_start), int(record_end)):
            t = word_tables[index]
            try:
                arr = np.array(t['data'])
            except Exception as e:
                applog.error(f'这个错误是{e}的arr的值是{arr}')


    finally:
        redis_client.close()
        client.close()
        cursor.close()
        conn.close()
        cursor_app.close()
        conn_app.close()


# Indicator normalisation

def update_measure_data(file_id, file_path, parent_table_pages):
    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB
    )

    # Create a cursor object to run SQL statements
    cursor = conn.cursor(buffered=True)
    # Query indicators through the vector store
    conn_app = mysql.connector.connect(
        host=MYSQL_HOST_APP,
        user=MYSQL_USER_APP,
        password=MYSQL_PASSWORD_APP,
        database=MYSQL_DB_APP
    )

    # Create a cursor object to run SQL statements
    cursor_app = conn_app.cursor(buffered=True)
    applog.info(f'目录黑名单为:{parent_table_pages}')
    db_service_word.delete_to_run(conn, cursor, file_id)
    db_service_word.insert_table_measure_from_vector_async_process(cursor, parent_table_pages, file_id, file_path)

    # Indicator normalisation
    db_service_word.update_ori_measure(conn, cursor, file_id)
    # db_service.delete_database(conn_app, cursor_app, file_id)
    cursor.close()
    conn.close()
    cursor_app.close()
    conn_app.close()


def merge_consecutive_arrays(word_info):
    merged_objects = []

    for info_obj in word_info:
        try:
            if info_obj['type'] == 'table':
                # Tables go straight into the merged list
                merged_objects.append(info_obj)
        except Exception as e:
            applog.error(f"解析数据错误: {e}")

    return merged_objects


def merge_consecutive_arrays_v1(pdf_info):
    merged_objects = []
    temp_array = {}

    def is_same_dimension(data1, data2):
        # Check that the two tables have the same row lengths
        if len(data1) != len(data2):
            return False
        return all(len(row1) == len(row2) for row1, row2 in zip(data1, data2))

    for info_obj in pdf_info:
        try:
            if info_obj['type'] == 'table':
                if not temp_array:
                    # Seed the buffer when it is empty
                    temp_array = info_obj
                else:
                    # Compare the current table with the buffered one
                    if is_same_dimension(temp_array['data'], info_obj['data']):
                        # Same shape: merge the data
                        temp_array['data'].extend(info_obj['data'])
                    else:
                        # Different shape: flush the buffer and restart it with this table
                        merged_objects.append(temp_array)
                        temp_array = info_obj
            else:
                # Not a table: flush the buffer if it holds anything
                if temp_array:
                    # Emit the buffered table and reset the buffer
                    merged_objects.append(temp_array)
                    temp_array = {}  # reset the buffer
        except Exception as e:
            applog.error(f"解析数据错误: {e}")

    # After the loop, flush whatever is still buffered
    if temp_array:
        merged_objects.append(temp_array)

    return merged_objects


def start_table_measure_job(file_id):
    conn_app = mysql.connector.connect(
        host=MYSQL_HOST_APP,
        user=MYSQL_USER_APP,
        password=MYSQL_PASSWORD_APP,
        database=MYSQL_DB_APP
    )

    # Create a cursor object to run SQL statements
    cursor_app = conn_app.cursor(buffered=True)

    select_process_query = '''
        select DISTINCT content from word_parse_process WHERE file_id = '{file_id}' and type='parse_table' order by page_num
    '''.format(file_id=file_id)
    cursor_app.execute(select_process_query)
    records = cursor_app.fetchall()
    word_info = []
    for record in records:
        word_info.append(eval(record[0]))

    # Collect the table data
    word_tables = merge_consecutive_arrays(word_info)
    redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)

    redis_client.set(f'measure_count_{file_id}', len(word_tables))

    cursor_app.close()
    conn_app.close()
    redis_client.close()

    records_range_parts = utils.get_range(len(word_tables), MEASURE_COUNT)
    processes = []
    for record_range in records_range_parts:
        # get_table_measure(file_id, word_tables, record_range,)
        p = Process(target=get_table_measure, args=(file_id, word_tables, record_range,))
        processes.append(p)
        p.start()

    for p in processes:
        p.join()
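start_table_measure_job above splits the merged table list into MEASURE_COUNT chunks via utils.get_range and hands each chunk to a worker process. That helper is not part of this diff; the sketch below is only a plausible reading of its contract, inferred from get_table_measure doing record_range.split('-') and iterating range(int(start), int(end)).

# A plausible implementation of the range splitter used above; the real
# utils.get_range is not shown in this diff, so this is an assumption
# about its contract: return "start-end" strings that together cover
# indices 0..total.
def get_range(total: int, parts: int) -> list[str]:
    parts = max(1, min(parts, total)) if total else 1
    step, remainder = divmod(total, parts)
    ranges, start = [], 0
    for i in range(parts):
        # spread the remainder over the first few chunks
        end = start + step + (1 if i < remainder else 0)
        ranges.append(f"{start}-{end}")
        start = end
    return ranges

print(get_range(10, 3))  # ['0-4', '4-7', '7-10']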
File diff suppressed because it is too large
@@ -0,0 +1,4 @@
/Users/zhengfei/opt/anaconda3/envs/py310/lib/python3.10/multiprocessing/resource_tracker.py:224: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown
  warnings.warn('resource_tracker: There appear to be %d '
/Users/zhengfei/opt/anaconda3/envs/py310/lib/python3.10/multiprocessing/resource_tracker.py:224: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown
  warnings.warn('resource_tracker: There appear to be %d '
File diff suppressed because it is too large
@@ -0,0 +1,57 @@
from docx import Document
from pymilvus import MilvusClient
import requests
import json, time, os

directory_path = '/Users/zhengfei/Desktop/大模型/书籍/第二批'


client = MilvusClient(
    uri='http://114.55.128.195:19530'
)
# Walk the directory
for filename in os.listdir(directory_path):
    # Build the full file path
    try:
        file_path = os.path.join(directory_path, filename)

        # Load the Word document
        print(file_path)
        doc = Document(file_path)

        text = ''
        # Read all paragraphs in the document
        i = 0
        data = []
        # for para in doc.paragraphs:
        for num in range(200, len(doc.paragraphs) - 200):
            # Append the paragraph text to the current chunk
            try:
                text += doc.paragraphs[num].text
                # Once the chunk exceeds 500 characters, store it and reset the chunk
                if len(text) > 500:
                    i += 1
                    response = requests.post("http://114.55.128.195:8001/get_embedding/", json={"text": [text]}, headers={"Content-Type": "application/json"})
                    res_json = json.loads(response.text)
                    if res_json["code"] == 200:
                        vector = res_json["data"][0]

                        measure_data = {}
                        measure_data['vector'] = vector
                        measure_data['text'] = text
                        measure_data['source'] = '/projects/ai_chat/knowledge_base/ydkf/content/骨盆和骶骼关节功能解剖 手法操作指南 详解局部解剖和功能 涵盖评估分析 运动 肌肉能量技术及替代_14533413.docx'
                        data.append(measure_data)
                    text = ''

                if (i > 20 or num == len(doc.paragraphs) - 200 - 1):
                    res = client.insert(
                        collection_name="ydkf",
                        data=data
                    )
                    i = 0
                    data = []
            except Exception as e:
                print(e)
    except Exception as e:
        print(e)
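The loader above grows a text buffer past 500 characters, embeds each chunk over HTTP, and flushes to Milvus once roughly 20 vectors accumulate. The same accumulate-and-flush pattern, stripped of the docx and HTTP specifics, looks like the sketch below; embed and sink are illustrative stand-ins, not names from this repo, and unlike the original the sketch also flushes the final partial chunk.

# Generic accumulate-and-flush sketch of the batching used above.
# embed() and sink() stand in for the HTTP embedding call and
# client.insert(); both names are illustrative, not from this repo.
from typing import Callable

def batch_ingest(paragraphs: list[str],
                 embed: Callable[[str], list[float]],
                 sink: Callable[[list[dict]], None],
                 chunk_chars: int = 500,
                 flush_every: int = 20) -> None:
    buffer, batch = '', []
    for para in paragraphs:
        buffer += para
        if len(buffer) > chunk_chars:
            batch.append({'vector': embed(buffer), 'text': buffer})
            buffer = ''
        if len(batch) >= flush_every:
            sink(batch)
            batch = []
    # flush the tail so short trailing text is not silently dropped
    if buffer:
        batch.append({'vector': embed(buffer), 'text': buffer})
    if batch:
        sink(batch)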
@@ -0,0 +1,269 @@
from docx import Document
import json
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from lxml import etree
import os
import zipfile

RESULT_TYPE_TEXT = 'text'
RESULT_TYPE_TABLE = 'table'

def build_result(result_type, index, data):
    return {
        'type': result_type,
        'index': index,
        'data': data
    }

def build_catalog_result(index, depth, data):
    return {
        'index': index,
        'depth': depth,
        'data': data
    }

# Read an XML part out of the docx archive
def get_xml_content(docx_filename, xml_filename):
    with zipfile.ZipFile(docx_filename) as z:
        return z.read(xml_filename)

def parse_paragraph(paragraph, index, namespaces):
    paragraph_text = paragraph.text.strip() if paragraph else ''
    if paragraph_text:
        return build_result(RESULT_TYPE_TEXT, index, paragraph_text)
    return None

def parse_table(table, index):
    table_data = []
    for row in table.rows:
        row_data = [cell.text for cell in row.cells]
        table_data.append(row_data)
    return build_result(RESULT_TYPE_TABLE, index, table_data)

def parse_paragraph_element(paragraph_element, index, namespaces):
    paragraph_xml = etree.fromstring(paragraph_element.xml)
    paragraph_text = ''.join(paragraph_xml.xpath('//w:t/text()', namespaces=namespaces)).strip()
    if paragraph_text:
        return build_result(RESULT_TYPE_TEXT, index, paragraph_text)
    return None

def parse_table_element(table_element, index, namespaces):
    table_xml = etree.fromstring(table_element.xml)
    table_data = []
    for row in table_xml.xpath('//w:tr', namespaces=namespaces):
        row_data = []
        for cell in row.xpath('./w:tc | ./w:sdt', namespaces=namespaces):
            cell_text = ''.join(cell.xpath('.//w:t/text()', namespaces=namespaces)).strip()
            grid_span_xpath = etree.XPath('.//w:tcPr/w:gridSpan/@w:val', namespaces=namespaces)
            grid_span = int(grid_span_xpath(cell)[0]) if grid_span_xpath(cell) else 1
            if grid_span > 1:
                row_data.extend([cell_text] * grid_span)
            else:
                row_data.append(cell_text)
        table_data.append(row_data)
    return build_result(RESULT_TYPE_TABLE, index, table_data)

def add_to_catalog(element_xml, index, catalog_content, namespaces, paragraph_text, heading_styles):
    p_element = etree.fromstring(element_xml)
    # outlineLvl = p_element.xpath('.//w:outlineLvl', namespaces=namespaces)
    # if outlineLvl:
    #     level = int(outlineLvl[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'))
    #     catalog_content.append(build_catalog_result(index, level, paragraph_text))
    level = is_heading_paragraph(p_element, heading_styles, namespaces)
    if level != -1:
        catalog_content.append(build_catalog_result(index, level, paragraph_text))

# Check whether a paragraph uses a heading style
def is_heading_paragraph(paragraph, heading_styles, namespaces):
    pPr = paragraph.find('.//w:pPr', namespaces=namespaces)
    if pPr is not None:
        pStyle = pPr.find('.//w:pStyle', namespaces=namespaces)
        pOutLineLvl = pPr.find('.//w:outlineLvl', namespaces=namespaces)
        if pStyle is not None:
            style_val = pStyle.get(f"{{{namespaces['w']}}}val")
            if style_val.isdigit():
                return int(style_val)
        if pOutLineLvl is not None:
            outLineLvl_val = pOutLineLvl.get(f"{{{namespaces['w']}}}val")
            if outLineLvl_val.isdigit():
                return int(outLineLvl_val) + 1
        # if pStyle is not None and pStyle.get(ns['w'] + 'val') in heading_styles:
        #     if style_val > 0:
        #         return True
    return -1

def get_paragraph_text(paragraph_element, namespaces):
    paragraph_text = ''
    for run in paragraph_element.findall('.//w:r', namespaces=namespaces):
        for text in run.findall('.//w:t', namespaces=namespaces):
            paragraph_text += text.text if text.text is not None else ''
    return paragraph_text

def add_to_catalog_paragraph(text, index, catalog_content, namespaces):
    # Append a paragraph to the catalog
    catalog_content.append(build_catalog_result(index, 1, text))  # assume level 1 by default

def parse_sdt_catalog(sdt_element, catalog_content, index, namespaces):
    sdt_content = sdt_element.find('.//w:sdtContent', namespaces=namespaces)
    if sdt_content is not None:
        for child in sdt_content:
            if child.tag.endswith('p'):  # paragraph inside the content control
                paragraph_text = get_paragraph_text(child, namespaces)
                if paragraph_text.strip():  # skip empty text
                    add_to_catalog_paragraph(paragraph_text, index, catalog_content, namespaces)
                    index += 1  # advance the index
            elif child.tag.endswith('tbl'):  # table inside the content control
                # handle table content here if needed
                pass
            elif child.tag.endswith('sdt'):  # nested content control
                index = parse_sdt_catalog(child, catalog_content, index, namespaces)  # recurse into the nested content control
    return index

def parse_docx(docx_path):
    try:
        document = Document(docx_path)
        styles_xml = get_xml_content(docx_path, 'word/styles.xml')
    except Exception as e:
        print(f"Error loading document: {e}")
        return None, None

    doc_content = []  # content (text + tables)
    catalog_content = []  # catalog (table of contents)
    current_index = 1  # global index counter
    paragraph_index = 0
    table_index = 0
    # XML root of the whole document
    xml_root = document.part.element
    namespaces = xml_root.nsmap

    # Collect all heading styles
    styles_root = etree.fromstring(styles_xml)
    heading_styles = set()
    for style in styles_root.xpath('//w:style', namespaces=namespaces):
        style_type = style.get(namespaces['w'] + 'type')
        if style_type == 'paragraph' and style.get(namespaces['w'] + 'styleId').startswith('Heading'):
            heading_styles.add(style.get(namespaces['w'] + 'styleId'))

    # Walk every element in the document body
    for i, element in enumerate(document.element.body):
        if isinstance(element, CT_P):  # paragraph
            paragraph_result = parse_paragraph_element(element, current_index, namespaces)
            if paragraph_result:
                doc_content.append(paragraph_result)
                # If it is a heading, record it in the catalog
                paragraph = document.paragraphs[paragraph_index]
                add_to_catalog(paragraph._element.xml, current_index, catalog_content, namespaces, paragraph.text, heading_styles)
                current_index += 1  # advance the index
            paragraph_index += 1
        elif isinstance(element, CT_Tbl):  # table
            table_result = parse_table_element(element, current_index, namespaces)
            if table_result:
                doc_content.append(table_result)
                current_index += 1  # advance the index
            table_index += 1
        elif element.tag.endswith('sdt'):  # content control
            current_index = parse_sdt(element, doc_content, current_index, namespaces, catalog_content, heading_styles)  # returns the updated index

    return json.dumps(doc_content, indent=4, ensure_ascii=False), json.dumps(catalog_content, indent=4, ensure_ascii=False)




def parse_sdt(sdt_element, doc_content, current_index, namespaces, catalog_content, heading_styles):
    sdtContent = sdt_element.find('.//w:sdtContent', namespaces=namespaces)
    if sdtContent is not None:
        for child in sdtContent:
            if child.tag.endswith('p'):  # paragraph inside the content control
                paragraph_text = ''
                for run in child.findall('.//w:r', namespaces=namespaces):
                    for text in run.findall('.//w:t', namespaces=namespaces):
                        paragraph_text += text.text if text.text is not None else ''
                if paragraph_text.strip():  # skip empty text
                    doc_content.append(build_result(RESULT_TYPE_TEXT, current_index, paragraph_text.strip()))
                    # If it is a heading, record it in the catalog
                    add_to_catalog(child.xml, current_index, catalog_content, namespaces, paragraph_text, heading_styles)
                    current_index += 1  # advance the index
            elif child.tag.endswith('tbl'):  # table inside the content control
                table_data = []
                merged_cells = {}  # tracks cells merged across rows
                for row_idx, row in enumerate(child.findall('.//w:tr', namespaces=namespaces)):
                    row_data = []
                    for col_idx, cell in enumerate(row.findall('.//w:tc', namespaces=namespaces)):
                        cell_text = ''
                        for run in cell.findall('.//w:r', namespaces=namespaces):
                            for text in run.findall('.//w:t', namespaces=namespaces):
                                cell_text += text.text if text.text is not None else ''

                        # Check whether the cell spans several columns
                        grid_span_xpath = etree.XPath('.//w:tcPr/w:gridSpan/@w:val', namespaces=namespaces)
                        grid_span = int(grid_span_xpath(cell)[0]) if grid_span_xpath(cell) else 1
                        if grid_span > 1:
                            row_data.extend([cell_text.strip()] * grid_span)
                        else:
                            row_data.append(cell_text.strip())

                        # Check whether the cell spans several rows
                        v_merge_xpath = etree.XPath('.//w:tcPr/w:vMerge/@w:val', namespaces=namespaces)
                        v_merge = v_merge_xpath(cell)
                        if v_merge and v_merge[0] == 'restart':
                            merged_cells[(row_idx, col_idx)] = (int(grid_span), 1)
                        elif v_merge and v_merge[0] == 'continue':
                            if (row_idx - 1, col_idx) in merged_cells:
                                merged_cells[(row_idx - 1, col_idx)] = (merged_cells[(row_idx - 1, col_idx)][0], merged_cells[(row_idx - 1, col_idx)][1] + 1)
                            # cells continued across rows are not appended to row_data again
                        else:
                            # only non-merged cells would need appending here
                            pass

                    # Expand cells merged across rows
                    for (r, c), (col_span, row_span) in list(merged_cells.items()):
                        if r < row_idx:
                            for i in range(row_span):
                                if r + i == row_idx:
                                    row_data[c:c] = [row_data[c]] * (col_span - 1)
                                    break
                            if r + row_span - 1 == row_idx:
                                del merged_cells[(r, c)]

                    table_data.append(row_data)
                if table_data:  # skip empty tables
                    doc_content.append(build_result(RESULT_TYPE_TABLE, current_index, table_data))
                    current_index += 1  # advance the index
            elif child.tag.endswith('sdt'):  # nested content control
                current_index = parse_sdt(child, doc_content, current_index, namespaces, catalog_content, heading_styles)  # recurse into the nested content control
    return current_index  # return the updated index

def split_text_table(json_data):
    # Group by element type
    text_elements = [element for element in json_data if element['type'] == 'text']
    table_elements = [element for element in json_data if element['type'] == 'table']

    # Serialise to JSON strings
    text_elements_json = json.dumps(text_elements, ensure_ascii=False, indent=4)
    table_elements_json = json.dumps(table_elements, ensure_ascii=False, indent=4)

    return text_elements_json, table_elements_json

def append_to_file(file_path, text):
    try:
        with open(file_path, 'a', encoding='utf-8') as file:
            file.write(text + '\n')
    except Exception as e:
        print(f"Error writing to file: {e}")

if __name__ == "__main__":
    current_directory = os.getcwd()
    docx_relative_path = '101.docx'
    file_relative_path = 'file\\docx\\test1.txt'
    docx_path = os.path.join(current_directory, docx_relative_path)
    file_path = os.path.join(current_directory, file_relative_path)
    try:
        parsed_content, catalog_content = parse_docx(docx_path)
        if parsed_content and catalog_content:
            json_parsed_content = json.loads(parsed_content)
            text_elements_json, table_elements_json = split_text_table(json_parsed_content)

            append_to_file(file_path, text_elements_json)
            append_to_file(file_path, table_elements_json)
            append_to_file(file_path, catalog_content)
    except Exception as e:
        print(f"Error parse_docx: {e}")
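One detail worth calling out in the parser above: the w:gridSpan handling repeats a merged cell's text once per spanned column, so header rows stay aligned with the data rows beneath them. The rule in isolation, on plain data rather than OOXML nodes (the cell values below are made up for illustration):

# Illustration of the gridSpan expansion rule used by the parsers above,
# applied to plain data instead of OOXML cells; the values are made up.
def expand_row(cells: list[tuple[str, int]]) -> list[str]:
    """cells is a list of (text, grid_span) pairs; a span of n repeats
    the text n times so merged headers line up with the data columns."""
    row = []
    for text, span in cells:
        row.extend([text] * span)
    return row

header = expand_row([('项目', 1), ('2023年度', 2), ('2022年度', 2)])
print(header)  # ['项目', '2023年度', '2023年度', '2022年度', '2022年度']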
@@ -0,0 +1,934 @@
import camelot
import re
from multiprocessing import Pool
import os, time, random
import json
from config_p import MILVUS_CLIENT, MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB, MEASURE_COUNT, MYSQL_HOST_APP, MYSQL_USER_APP, MYSQL_PASSWORD_APP, MYSQL_DB_APP
from datetime import datetime
# Read the PDF
import PyPDF2
# Analyse the PDF layout and extract the text
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
import pdfplumber
import mysql.connector
import utils
from pymilvus import MilvusClient
import llm_service
import db_service
import pdf_title
import numpy as np
from multiprocessing import Process
from config_p import REDIS_HOST, REDIS_PORT, REDIS_PASSWORD
import redis


'''
Known issues:
1. Table/text extraction errors: when text and a table share a page with the text first, the text cannot be extracted.
2. LLM extraction errors: for 2023 operating revenue, the main-business revenue, per-product revenue and change ratio get extracted incorrectly.
3. Indicators that live in tables get extracted as body text.
4. The LLM groups semantically unrelated indicators together; consider using vector similarity to tell them apart.
'''

# Processing pipeline
# 1. get_table_range fetches all tables plus their context with multiple processes, producing one complete list
# 2. a single process merges tables split across pages, producing a new array of table objects
# 3. the new table array feeds the original indicator-parsing flow, again with multiple processes


STR_PATTERN = '营业收入|净利润|变动比例|损益|现金流量净额|现金净流量|现金流|每股收益|总资产|资产总额|收益率|货币资金|应收账款|存货|固定资产|在建工程|商誉|短期借款|应付账款|合同负债|长期借款|营业成本|销售费用|管理费用|财务费用|研发费用|研发投入'
PATTERN = '品牌类型|分门店|销售渠道|行业名称|产品名称|地区名称|子公司名称|业绩快报|调整情况说明|调整年初资产负债表|计入当期损益的政府补助|主要子公司|分部|母公司资产负债表|显示服务|渠道|商品类型|合同分类|会计政策变更|地区分类|研发项目|分类产品|表头不合规的表格|内部控制评价|关联方|国内地区|国外地区|销售区域|存货库龄|外币|逾期60天以上|欧元|英镑|美元|日元'
MUILT_PATTERN = '调整前'
# unit_pattern = re.compile(r'单位[:|:]?(百万元|千万元|亿元|万元|千元|元)')
unit_pattern = re.compile(r'(单位|单元|人民币).{0,6}?(百万元|千万元|亿元|万元|千元|元).{0,3}?')  # relaxed unit rule: no colon required, only the distance is limited


# Get the header info for an indicator
def get_col_num_info(array, row_num, col_num, x, y):
    num_info = ""
    for j in range(col_num):
        if len(str(array[x][j])) > 50:
            continue
        num_info += str(array[x][j])

    return num_info.replace('%', '')


# Get the header info for an indicator
def get_row_num_info(array, row_num, col_num, x, y):
    num_info = ""

    for i in range(row_num):
        if len(str(array[i][y])) > 50:
            continue
        num_info += str(array[i][y])

    return num_info


def table_converter(table):
    table_string = ''
    # Walk every row of the table
    for row_num in range(len(table)):
        row = table[row_num]
        # Strip line breaks from wrapped cell text
        cleaned_row = [item.replace('\n', ' ') if item is not None and '\n' in item else 'None' if item is None else item for item in row]
        # Join the table into a string, watching '|' and '\n'
        table_string += (','.join(cleaned_row))
    # Drop the last newline
    table_string = table_string[:-1]
    return table_string


def get_table_range(file_path, file_id, pages, tables_range):

    print('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
    start = time.time()

    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB
    )

    # Create a cursor object to run SQL statements
    cursor = conn.cursor(buffered=True)
    conn_app = mysql.connector.connect(
        host=MYSQL_HOST_APP,
        user=MYSQL_USER_APP,
        password=MYSQL_PASSWORD_APP,
        database=MYSQL_DB_APP
    )
    cursor_app = conn_app.cursor(buffered=True)

    redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
    try:
        tables = camelot.read_pdf(file_path, pages=pages, strip_text=',\n', copy_text=['v', 'h'], shift_text=['l'])
        for t in tables:

            top = t._bbox[3]
            buttom = t._bbox[1]
            page_num = int(t.page)
            table_index = int(t.order)
            arr = np.array(t.data)
            if len(arr[0]) == 6 and arr[0][0] == "项目" and arr[0][1] == '' and '2022' in arr[0][2] and '2021' in arr[0][2]:
                remaining_value = arr[0][2]  # initial_value.replace("项目", "", 1)
                split_index = len(remaining_value) // 2
                arr[0][1] = remaining_value[:split_index]
                arr[0][2] = remaining_value[split_index:]
            if len(arr[0]) == 4 and all(value == arr[0][0] for value in arr[0]) and all("项目" in arr[0][0] and "附注" in arr[0][0] for value in arr[0]):
                initial_value = arr[0][0]
                project_value = "项目"
                note_value = "附注"
                remaining_value = initial_value.replace("项目", "", 1).replace("附注", "", 1)
                split_index = len(remaining_value) // 2
                first_half = remaining_value[:split_index]
                second_half = remaining_value[split_index:]
                # Work out where 项目 sits in the original value
                if "项目" in initial_value and first_half in initial_value:
                    project_index = initial_value.index("项目")
                    year_index = initial_value.index(first_half)

                    # If 项目 comes after the year text, swap the halves
                    if project_index > year_index:
                        first_half, second_half = second_half, first_half
                arr[0] = [project_value, note_value, first_half, second_half]
            if len(arr[0]) == 3 and all(value == arr[0][0] for value in arr[0]) and all("项目" in arr[0][0] for value in arr[0]):
                initial_value = arr[0][0]
                project_value = "项目"
                # note_value = "附注"
                remaining_value = initial_value.replace("项目", "", 1)
                split_index = len(remaining_value) // 2
                first_half = remaining_value[:split_index]
                second_half = remaining_value[split_index:]
                arr[0] = [project_value, first_half, second_half]
            # for i in range(len(arr[0])):
            #     if arr[0][i] == arr[1][i] and len(arr[0][i]) < 5:
            #         print(f'{arr[0][i]}')
            #         arr[1][i] = ''
            # camelot keeps the spaces; split merged cells manually on those spaces
            # for line in arr:
            for line in arr:
                if not line[0].replace('.', '', 1).isdigit() and any(line[i] == line[i+1] and ' ' in line[i] for i in range(1, len(line) - 1)):
                    for i in range(1, len(line) - 1):
                        if line[i] == line[i+1] and ' ' in line[i]:
                            split_value = line[i]
                            split_parts = split_value.split(' ', 1)  # split on the first space
                            if len(split_parts) == 2:  # make sure a split actually happened
                                first_half, second_half = split_parts
                                line[i] = first_half
                                line[i+1] = second_half
                            break

            # After this, arr must not contain spaces any more
            # arr = [[item.replace(' ', '') for item in line] for line in arr]
            arr = np.char.replace(arr, ' ', '')

            # Guard against two tables glued together side by side
            first_row = arr[0]
            if len(first_row) % 2 == 0 and all(cell.strip() for cell in first_row):
                mid_point = len(first_row) // 2
                if np.array_equal(first_row[:mid_point], first_row[mid_point:]):
                    new_arr = []
                    for i in range(mid_point):
                        new_row = np.concatenate([arr[:, i], arr[:, i + mid_point]])
                        new_arr.append(new_row)
                    arr = np.array(new_arr).T
            # From here on, handle invalid headers
            try:
                invalid_headers = ["上年年末余额"]
                non_empty_values = [value for value in first_row if value]  # every non-empty first-row value must be identical
                if len(set(non_empty_values)) == 1 and non_empty_values[0] in invalid_headers:
                    arr[0] = ["表头不合规的表格"] * len(first_row)
            except Exception as e:
                print(f'在识别表头是否合规时出现了报错:{e}')
            # Guard against camelot misreading '2023年度2022年度' as a single cell
            if not arr[0][0].replace('.', '', 1).isdigit() and any(arr[0][i] == arr[0][i+1] and '2023' in arr[0][i] and '2022' in arr[0][i] for i in range(1, len(arr[0])-1)):
                for i in range(1, len(arr[0])-1):
                    if arr[0][i] == arr[0][i+1] and '2023' in arr[0][i] and '2022' in arr[0][i]:
                        split_value = arr[0][i]
                        split_index = len(split_value) // 2
                        first_half = split_value[:split_index]
                        second_half = split_value[split_index:]
                        arr[0][i] = first_half
                        arr[0][i+1] = second_half
                        break
            # Guard against 2023 and 2022 appearing together
            if not arr[0][0].replace('.', '', 1).isdigit():
                # Walk the first row
                for i in range(1, len(arr[0]) - 1):
                    # Two adjacent cells holding '2023' then '2022' (with '2023' first)
                    if (('2023' in arr[0][i] and '2022' in arr[0][i+1]) and
                            (arr[0][i].index('2023') < arr[0][i+1].index('2022'))):
                        # Normalise both cells
                        arr[0][i] = '2023年'
                        arr[0][i+1] = '2022年'
                        break
            # From here on, check for values that may have been parsed wrongly:
            for i, row in enumerate(arr):
                if len(row) >= 4:
                    # Condition: first column non-numeric, columns 2 and 4 empty, column 3 holds three decimal points (three numbers fused into one cell)
                    if (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 4 and len(row[2].rsplit('.', 1)[-1]) == 2) and (row[3] == ''):
                        split_values = row[2].split('.')
                        # Make sure it really splits into three values
                        if len(split_values) == 4:
                            new_value1 = f"{split_values[0]}.{split_values[1][:2]}"
                            new_value2 = f"{split_values[1][2:]}.{split_values[2][:2]}"
                            new_value3 = f"{split_values[2][2:]}.{split_values[3]}"
                            row[1] = new_value1
                            row[2] = new_value2
                            row[3] = new_value3
                    # Condition: first column non-numeric, columns 2 and 4 empty, columns 3 and 5 hold two decimal points each (two numbers fused per cell)
                    if len(row) >= 5 and (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 3) and (row[3] == '') and (len(row[4].split('.')) == 3) and len(row[2].rsplit('.', 1)[-1]) == 2 and len(row[4].rsplit('.', 1)[-1]) == 2:
                        split_value_3 = row[2].split('.')
                        split_value_5 = row[4].split('.')

                        if len(split_value_3) == 3:
                            new_value2 = f"{split_value_3[0]}.{split_value_3[1][:2]}"
                            new_value3 = f"{split_value_3[1][2:]}.{split_value_3[2]}"

                        if len(split_value_5) == 3:
                            new_value4 = f"{split_value_5[0]}.{split_value_5[1][:2]}"
                            new_value5 = f"{split_value_5[1][2:]}.{split_value_5[2]}"

                        row[1] = new_value2
                        row[2] = new_value3
                        row[3] = new_value4
                        row[4] = new_value5
                    # Condition: first column non-numeric, column 2 empty, column 3 holds two decimal points, column 4 is a normal number (two numbers fused into one cell)
                    if len(row) >= 4 and (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 3) and len(row[2].rsplit('.', 1)[-1]) == 2 and (row[3].replace('-', '', 1).replace('.', '', 1).isdigit()):
                        split_values = row[2].split('.')
                        if len(split_values) == 3:
                            new_value2 = f"{split_values[0]}.{split_values[1][:2]}"
                            new_value3 = f"{split_values[1][2:]}.{split_values[2]}"
                            row[1] = new_value2
                            row[2] = new_value3
                    # Condition: first column non-numeric and a later cell contains a '%' that is not at the end: split there
                    if not row[0].replace('.', '', 1).isdigit():
                        for i in range(1, len(row) - 1):
                            if row[i] == '' and '%' in row[i + 1] and len(row[i + 1].split('%')) == 2:
                                split_values = row[i + 1].split('%')
                                new_value1 = f"{split_values[0]}%"
                                new_value2 = f"{split_values[1]}"
                                row[i] = new_value1
                                row[i + 1] = new_value2
                                break


            new_data = arr.tolist()  # kept for saving to the database later
            new_data = utils.check_black_table_list(new_data)
            rows, cols = arr.shape
            if rows == 1 and cols == 1:
                continue
            arr_str = ''.join([''.join(map(str, row)) for row in arr])

            # Filter out tables that hold none of the indicators we extract
            matches = re.findall(STR_PATTERN, arr_str)
            pattern = re.findall(PATTERN, arr_str)
            muilt_pattern = re.findall(MUILT_PATTERN, arr_str)
            if len(matches) > 0 and len(pattern) == 0 and len(muilt_pattern) < 5:
                if not tables_range.get(page_num):
                    tables_range[page_num] = []

                tables_range[page_num].append({
                    'top': top,
                    'buttom': buttom,
                    'table_index': table_index,
                    'page_num': page_num,
                })

                db_service.insert_pdf_parse_process({
                    'file_id': file_id,
                    'page_num': page_num,
                    'page_count': 100,
                    'type': 'parse_table',
                    'content': {
                        'top': top,
                        'buttom': buttom,
                        'page_num': page_num,
                        'table_index': table_index,
                        "type": "table",
                        "data": new_data,
                        'sort_num': page_num * 1000 - top
                    }}, conn_app, cursor_app)
    except Exception as e:
        print(f'camelot解析表格时出现了{e}')
    get_text_content(file_path, file_id, tables_range, pages, conn, cursor, redis_client, conn_app, cursor_app)

    cursor.close()
    conn.close()
    cursor_app.close()
    conn_app.close()
    redis_client.close()

    end = time.time()
    print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))


def text_in_table(top, tables_range, page_num):
    if tables_range.get(page_num):
        for range in tables_range[page_num]:
            if top < range['top'] and top > range['buttom']:
                return True
    return False


def get_text_type(text: str):
    text = re.sub(r"\s", "", text)
    first_re = '年度报告'
    page_number_pattern = re.compile(r'^\d+(/\d+)?$')

    if re.search(first_re, text.strip()):
        return 'page_header'

    if page_number_pattern.match(text.strip()):
        return 'page_footer'

    if len(text) < 20 and text.endswith('页'):
        return 'page_footer'

    return 'text'


# Read the text content of the PDF, excluding tables
def get_text_content(pdf_path, file_id, tables_range, pages, conn, cursor, redis_client, conn_app, cursor_app):
    """
    :return: the text content of the PDF file, excluding tables
    """
    # print(f'tables_range 的值为{tables_range}')
    # print('----------------')
    # print(pages)

    page_start = pages.split('-')[0]
    page_end = pages.split('-')[1]
    print(f'pages的值为{pages}')
    select_year_select = f"""select report_type,year from report_check where id = {file_id}"""
    cursor.execute(select_year_select)
    record_select = cursor.fetchall()
    report_type = record_select[0][0]
    report_year = record_select[0][1]
    select_pdf_text_check = f"""select count(1) from pdf_text_info where file_id = {file_id}"""
    # check_if_empty_query = f"SELECT COUNT(*) FROM pdf_text_info where file_id = {file_id} and page_num = {page_num}"
    cursor.execute(select_pdf_text_check)
    is_empty = cursor.fetchone()[0] == 0
    query = "SELECT title_list,button_list FROM table_title_list WHERE report_year = %s"
    cursor_dict = conn.cursor(dictionary=True)
    cursor_dict.execute(query, (report_year,))
    result = cursor_dict.fetchone()
    title_list = result['title_list']
    button_list = result['button_list']


    # Extract the pages from the PDF, e.g. page_numbers=[4,5,6]
    for pagenum, page in enumerate(extract_pages(pdf_path)):
        try:
            if pagenum + 1 < int(page_start) or pagenum + 1 > int(page_end):
                continue
            # Update the parsed-page counter in redis
            if not redis_client.exists(f'parsed_page_count_{file_id}'):
                redis_client.set(f'parsed_page_count_{file_id}', 0)

            redis_client.incr(f'parsed_page_count_{file_id}')

            # Collect all elements on the page
            page_elements = [(element.y1, element) for element in page._objs]
            # Walk the elements that make up the page
            line_texts = []
            # if not utils.pdf_text_flag(line_text):
            #     line_texts.append(line_text)
            for i, component in enumerate(page_elements):
                # Pull the element out of the page layout
                element = component[1]
                # Check whether the element is a text element
                if isinstance(element, LTTextBoxHorizontal):
                    # Check whether the text appears inside a table
                    line_text = element.get_text().replace('\n', '')
                    line_text = re.sub(r"\s", "", line_text)

                    # Text that passes the filter goes into pdf_text_info for typo detection
                    if not utils.pdf_text_flag(line_text):
                        line_texts.append(line_text)
                        # db_service.insert_pdf_text_info({
                        #     'file_id': file_id,
                        #     'page_num': pagenum+1,
                        #     'text': line_text
                        # }, conn, cursor)

                    element_top = element.bbox[3]
                    element_buttom = element.bbox[1]
                    out_table_list = ['母公司现金流量表', '母公司利润表', '母公司资产负债表', '子公司']

                    # Check whether this text appears inside a table
                    if tables_range.get(pagenum + 1):
                        for range in tables_range[pagenum + 1]:
                            if element_top < range['top'] and element_top > range['buttom']:  # parent-company tables keep getting folded into the previous table:
                                pass
                            else:
                                if element_top - range['top'] < 150 and element_top - range['top'] > 5 and (not text_in_table(element_top, tables_range, pagenum + 1) or any(word in line_text for word in out_table_list)):  # or any(word in line_text for word in out_table_list)
                                    text_type = get_text_type(line_text)
                                    if text_type in ('page_header', 'page_footer'):
                                        break
                                    if pagenum == 44:
                                        print(f'line_text在第44页的值有{line_text}')
                                    # This applies to the whole page and would drop many valid tables
                                    # Record the pages that must be filtered out
                                    if len(re.findall('母公司|现金流量表补充', line_text)) > 0:
                                        db_service.insert_measure_parser_info({
                                            'file_id': file_id,
                                            'content': pagenum + 1,
                                            'type': 'parent_com',
                                        }, conn_app, cursor_app)


                                    # Keep the text just above each table: it holds the table title and the indicator unit
                                    table_info = {}
                                    if utils.check_table_title_black_list(line_text, title_list):
                                        db_service.insert_measure_parser_info({
                                            'file_id': file_id,
                                            'content': f"{range['page_num']}_{range['table_index']}",
                                            'type': 'table_index',
                                        }, conn_app, cursor_app)
                                    if utils.check_table_title_black_list_measure(line_text):
                                        db_service.insert_measure_parser_info_measure({
|
||||
'file_id': file_id,
|
||||
'content': f"{range['page_num']}_{range['table_index']}",
|
||||
'type': 'measure_index',
|
||||
},conn_app,cursor_app,line_text)
|
||||
|
||||
if re.findall(unit_pattern, line_text):
|
||||
range['unit_flag'] = True
|
||||
table_info = get_table_unit_info(file_id,line_text,range['page_num'],range['table_index'])
|
||||
db_service.insert_table_unit_info_v1(table_info,conn,cursor)
|
||||
# if utils.check_table_title_black_list(line_text):
|
||||
# db_service.insert_measure_parser_info({
|
||||
# 'file_id': file_id,
|
||||
# 'content': f"{range['page_num']}_{range['table_index']}",
|
||||
# 'type': 'table_index',
|
||||
# },conn,cursor)
|
||||
else:
|
||||
if len(line_text) <= 5 or len(re.findall('单位|适用', line_text)) > 0 :
|
||||
pass
|
||||
#else:
|
||||
# table_info = get_table_text_info(file_id,line_text,range['page_num'],range['table_index'])
|
||||
# db_service.insert_table_text_info(table_info,conn,cursor)
|
||||
#通过关键词黑名单匹配表格上方的文本区域,提取需要过滤的表格
|
||||
# if utils.check_table_title_black_list(line_text):
|
||||
# db_service.insert_measure_parser_info({
|
||||
# 'file_id': file_id,
|
||||
# 'content': f"{range['page_num']}_{range['table_index']}",
|
||||
# 'type': 'table_index',
|
||||
# },conn,cursor)
|
||||
if utils.check_line_text(line_text):
|
||||
|
||||
db_service.insert_pdf_parse_process({
|
||||
'file_id': file_id,
|
||||
'page_num' : pagenum+1,
|
||||
'page_count' : 100,
|
||||
'type' : 'parse_table',
|
||||
'content':{
|
||||
'top' : element_top,
|
||||
'buttom' : element_buttom,
|
||||
'page_num' : range['page_num'],
|
||||
'table_index' : range['table_index'],
|
||||
"type" : text_type,
|
||||
'content' : line_text,
|
||||
'sort_num' : range['page_num']*1000 - element_top
|
||||
}},conn_app,cursor_app)
|
||||
|
||||
break
|
||||
#处理母公司表格标题在页面底部,完整表格在下一页
|
||||
if element_buttom < 150 and not text_in_table(element_top, tables_range, pagenum+1):
|
||||
text_type = get_text_type(line_text)
|
||||
|
||||
if text_type == 'page_footer':
|
||||
continue
|
||||
|
||||
table_info = {}
|
||||
# 记录需要过滤掉的页码
|
||||
if len(re.findall('母公司|现金流量表补充', line_text)) > 0:
|
||||
db_service.insert_measure_parser_info({
|
||||
'file_id': file_id,
|
||||
'content': pagenum+2,
|
||||
'type': 'parent_com',
|
||||
},conn_app,cursor_app)
|
||||
|
||||
# Match the text at the bottom of this page against the title keyword blacklist; on a hit, mark the first table on the next page (pagenum+2, table_index 1) to be filtered
|
||||
if utils.check_table_title_black_list_button(line_text,button_list):
|
||||
db_service.insert_measure_parser_info({
|
||||
'file_id': file_id,
|
||||
'content': f"{pagenum+2}_1",
|
||||
'type': 'table_index',
|
||||
},conn_app,cursor_app)
|
||||
if utils.check_table_title_black_list_measure(line_text):
|
||||
db_service.insert_measure_parser_info_measure({
|
||||
'file_id': file_id,
|
||||
'content': f"{pagenum+2}_1",
|
||||
'type': 'measure_index',
|
||||
},conn_app,cursor_app,line_text)
|
||||
|
||||
if re.findall(unit_pattern, line_text):
|
||||
table_info = get_table_unit_info(file_id,line_text,pagenum+2,1)
|
||||
db_service.insert_table_unit_info(table_info,conn,cursor)
|
||||
|
||||
if utils.check_line_text(line_text):
|
||||
db_service.insert_pdf_parse_process({
|
||||
'file_id': file_id,
|
||||
'page_num' : pagenum+1,
|
||||
'page_count' : 100,
|
||||
'type' : 'parse_table',
|
||||
'content':{
|
||||
'top' : element_top,
|
||||
'buttom' : element_buttom,
|
||||
'page_num' : pagenum+1,
|
||||
"type" : text_type,
|
||||
'content' : line_text,
|
||||
'sort_num' : (pagenum+1)*1000 - element_top
|
||||
}},conn_app,cursor_app)
|
||||
if is_empty:
|
||||
db_service.batch_insert_page_text_nocheck({
|
||||
'file_id': file_id,
|
||||
'page_num' : pagenum+1,
|
||||
'text' : line_texts
|
||||
},conn,cursor)
|
||||
#print('文本这里没有重跑')
|
||||
else:
|
||||
db_service.batch_insert_page_text({
|
||||
'file_id': file_id,
|
||||
'page_num' : pagenum+1,
|
||||
'text' : line_texts
|
||||
},conn,cursor)
|
||||
except Exception as e:
|
||||
print(f'{pagenum}页处理异常')
|
||||
print(e)
|
||||
|
||||
|
||||
def get_table_unit_info(file_id,line_text,page_num,table_index):
|
||||
table_info = {}
|
||||
table_info['file_id'] = file_id
|
||||
match = unit_pattern.search(line_text)
|
||||
if match:
|
||||
unit = match.group(2)
|
||||
table_info['unit'] = unit
|
||||
|
||||
table_info['page_num'] = page_num
|
||||
table_info['table_index'] = table_index
|
||||
#print(table_info)
|
||||
return table_info
|
||||
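# Usage sketch. unit_pattern is defined elsewhere in this module; assuming its second
# capture group holds the unit, a header line like '单位:万元 币种:人民币' yields:
# get_table_unit_info('1778', '单位:万元 币种:人民币', 12, 1)
# -> {'file_id': '1778', 'unit': '万元', 'page_num': 12, 'table_index': 1}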
|
||||
|
||||
def get_table_text_info(file_id,line_text,page_num,table_index):
|
||||
table_info = {}
|
||||
table_info['file_id'] = file_id
|
||||
table_info['text_info'] = line_text
|
||||
table_info['page_num'] = page_num
|
||||
table_info['table_index'] = table_index
|
||||
#print(table_info)
|
||||
return table_info
|
||||
|
||||
# 读取pdf中的表格,并将表格中指标和表头合并,eg: 2022年1季度营业收入为xxxxx
|
||||
def get_table_measure(file_id, pdf_tables, record_range):
|
||||
"""
|
||||
:return: pdf中的表格,并将表格中指标和表头合并,eg: 2022年1季度营业收入为xxxxx
|
||||
"""
|
||||
try:
|
||||
|
||||
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
|
||||
|
||||
conn = mysql.connector.connect(
|
||||
host = MYSQL_HOST,
|
||||
user = MYSQL_USER,
|
||||
password = MYSQL_PASSWORD,
|
||||
database = MYSQL_DB
|
||||
)
|
||||
|
||||
# 创建一个cursor对象来执行SQL语句
|
||||
cursor = conn.cursor(buffered=True)
|
||||
conn_app = mysql.connector.connect(
|
||||
host = MYSQL_HOST_APP,
|
||||
user = MYSQL_USER_APP,
|
||||
password = MYSQL_PASSWORD_APP,
|
||||
database = MYSQL_DB_APP
|
||||
)
|
||||
|
||||
# 创建一个cursor对象来执行SQL语句
|
||||
cursor_app = conn_app.cursor(buffered=True)
|
||||
|
||||
select_year_select = f"""select report_type,year from report_check where id = {file_id}"""
|
||||
cursor.execute(select_year_select)
|
||||
record_select = cursor.fetchall()
|
||||
report_type = record_select[0][0]
|
||||
report_year = record_select[0][1]
|
||||
|
||||
client = MilvusClient(
|
||||
uri= MILVUS_CLIENT
|
||||
)
|
||||
print('提取指标任务 %s (%s)...' % (record_range, os.getpid()))
|
||||
start = time.time()
|
||||
|
||||
record_start = record_range.split('-')[0]
|
||||
record_end = record_range.split('-')[1]
|
||||
for index in range(int(record_start),int(record_end)):
|
||||
t = pdf_tables[index]
|
||||
measure_obj =[]
|
||||
data_dict = {}
|
||||
measure_list = []
|
||||
try:
|
||||
arr = np.array(t['data'])
|
||||
rows, cols = arr.shape
|
||||
if rows == 1 and cols == 1:
|
||||
continue
|
||||
|
||||
row_num , col_num = -1 , -1
|
||||
|
||||
# 使用嵌套循环遍历数组,获取第一个数值位置
|
||||
for i in range(rows):
|
||||
for j in range(cols):
|
||||
if j == 0 or i == 0:#防止第一列识别出数字
|
||||
continue
|
||||
measure_value_config = str(arr[i, j]).replace('(','').replace(')','')
|
||||
|
||||
if re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', measure_value_config):
|
||||
if j == cols-1:
|
||||
row_num , col_num = i , j
|
||||
break
|
||||
elif (re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', measure_value_config)
|
||||
or measure_value_config == '-'):
|
||||
row_num , col_num = i , j
|
||||
break
|
||||
else:
|
||||
continue
|
||||
break
|
||||
# 遍历数值二维数组,转成带语义的指标
|
||||
if row_num != -1 and col_num != -1:
|
||||
for i in range(row_num,arr.shape[0]):
|
||||
for j in range(col_num,arr.shape[1]):
|
||||
measure_value = str(arr[i, j]).replace('%','').replace('(','-').replace(')','')
|
||||
if measure_value == '-' or measure_value == '' or len(measure_value) > 20:
|
||||
continue
|
||||
else:
|
||||
row_num_info = get_row_num_info(arr,row_num,col_num,i,j)
|
||||
col_num_info = get_col_num_info(arr,row_num,col_num,i,j)
|
||||
|
||||
#如果上表头为空则认为是被截断,除了研发投入特殊处理其它过滤
|
||||
if row_num_info in ('','-',')',')'):
|
||||
continue
|
||||
|
||||
#特殊处理非经常性损益合计和非经常性损益净额同时出现时保留净额
|
||||
if col_num_info == '非经常性损益合计':
|
||||
continue
|
||||
|
||||
if utils.check_pdf_measure_black_list(f"{col_num_info}{row_num_info}"):
|
||||
continue
|
||||
|
||||
#去掉没有周期的指标
|
||||
if utils.check_pdf_measure(f"{col_num_info}{row_num_info}"):
|
||||
continue
|
||||
|
||||
#判断上表头和左表头周期是否一致,不一致过滤
|
||||
row_period = utils.get_period_type_other(row_num_info, report_year)
|
||||
col_period = utils.get_period_type_other(col_num_info, report_year)
|
||||
if(row_period != col_period and row_period != 'c_n' and col_period != 'c_n'):
|
||||
continue
|
||||
units_mapping = {
|
||||
"百万元": "百万元",
|
||||
"千万元": "千万元",
|
||||
"亿元": "亿元",
|
||||
"万元": "万元",
|
||||
"千元": "千元",
|
||||
"元": "元",
|
||||
"元/股": "元"
|
||||
}
|
||||
row_num_info = row_num_info.replace('%','增减')
|
||||
#num_info = f"{col_num_info}{row_num_info}".replace('()','').replace('加:','').replace('减:','').replace('%','')
|
||||
num_info = utils.get_clean_text(f"{row_num_info}{col_num_info}")
|
||||
num_info_bak = utils.get_clean_text(f"{col_num_info}{row_num_info}")
|
||||
measure_unit = ''
|
||||
#"%": "同期增减"
|
||||
combined_info = f"{row_num_info} {col_num_info}"
|
||||
# for unit in units_mapping:
|
||||
# if unit in row_num_info:
|
||||
# measure_unit = units_mapping[unit]
|
||||
# break
|
||||
if utils.get_percent_flag(row_num_info) == '1':
|
||||
measure_unit = ''
|
||||
else:
|
||||
for unit in units_mapping:
|
||||
if re.search(rf'\(\s*{unit}(\s*人民币)?\s*\)|\(\s*{unit}(\s*人民币)?\s*\)', combined_info) or (re.search(rf'{unit}', combined_info) and any(re.search('单位', item) for item in arr[0])):
|
||||
measure_unit = units_mapping[unit]
|
||||
break
|
||||
measure_list.append({
|
||||
'measure_name': num_info,
|
||||
'measure_value': measure_value,
|
||||
'measure_unit':measure_unit,
|
||||
})
|
||||
measure_list.append({
|
||||
'measure_name': num_info_bak,
|
||||
'measure_value': measure_value,
|
||||
'measure_unit':measure_unit,
|
||||
})
|
||||
|
||||
if not redis_client.exists(f'parsed_measure_count_{file_id}'):
|
||||
redis_client.set(f'parsed_measure_count_{file_id}', 0)
|
||||
|
||||
redis_client.incr(f'parsed_measure_count_{file_id}')
|
||||
|
||||
if len(measure_list) > 0:
|
||||
data_dict["measure_list"] = measure_list
|
||||
data_dict["page_num"] = f"{str(t['page_num'])}_{str(t['table_index'])}"
|
||||
data_dict['file_id'] = file_id
|
||||
measure_obj.append(data_dict)
|
||||
db_service.insert_measure_data_to_milvus(client,measure_obj,cursor_app,conn_app)
|
||||
except Exception as e:
|
||||
print(f"循环获取表格数据这里报错了,数据是{t['data']},位置在{index}")
|
||||
print(f"错误是:{e}")
|
||||
end = time.time()
|
||||
print('提取指标 %s runs %0.2f seconds.' % (record_range, (end - start)))
|
||||
except Exception as e:
|
||||
print(f'这个错误是{e},所在的位置是{record_start}-{record_end}')
|
||||
record_start = record_range.split('-')[0]
|
||||
record_end = record_range.split('-')[1]
|
||||
for index in range(int(record_start),int(record_end)):
|
||||
t = pdf_tables[index]
|
||||
measure_obj =[]
|
||||
data_dict = {}
|
||||
measure_list = []
|
||||
try:
|
||||
arr = np.array(t['data'])
|
||||
except Exception as e:
|
||||
print(f'错误:{e},arr 的值为 {arr}')
|
||||
|
||||
|
||||
finally:
|
||||
redis_client.close()
|
||||
client.close()
|
||||
cursor.close()
|
||||
conn.close()
|
||||
cursor_app.close()
|
||||
conn_app.close()
|
||||
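# Header-merging sketch for get_table_measure (assuming, per the comments above, that
# get_row_num_info returns the top header and get_col_num_info the left header of a cell):
# arr = [['项目',     '2022年', '2021年'],
#        ['营业收入', '100',    '90']]
# the cell '100' produces measure names '2022年营业收入' and '营业收入2022年'
# (both orderings are appended) with measure_value '100'.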
|
||||
#多进程任务分发,根据参数判断是调表格还是正文
|
||||
def dispatch_job(job_info):
|
||||
try:
|
||||
type = job_info['type']
|
||||
path = job_info['path']
|
||||
file_id = job_info['file_id']
|
||||
page_num = job_info['page_num']
|
||||
tables_range = job_info['tables_range']
|
||||
if type == 'table':
|
||||
get_table_range(path, file_id, page_num, tables_range)
|
||||
|
||||
except Exception as e:
|
||||
print(e)
|
||||
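# Example job_info payload (hypothetical values):
# dispatch_job({
#     'type': 'table',          # only 'table' is handled here -> get_table_range
#     'path': '/data/reports/1778.pdf',
#     'file_id': '1778',
#     'page_num': '1-20',
#     'tables_range': {},
# })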
|
||||
#指标归一化处理
|
||||
|
||||
def update_measure_data(file_id,file_path,parent_table_pages):
|
||||
conn = mysql.connector.connect(
|
||||
host = MYSQL_HOST,
|
||||
user = MYSQL_USER,
|
||||
password = MYSQL_PASSWORD,
|
||||
database = MYSQL_DB
|
||||
)
|
||||
|
||||
# 创建一个cursor对象来执行SQL语句
|
||||
cursor = conn.cursor(buffered=True)
|
||||
# #通过向量查询指标
|
||||
conn_app = mysql.connector.connect(
|
||||
host = MYSQL_HOST_APP,
|
||||
user = MYSQL_USER_APP,
|
||||
password = MYSQL_PASSWORD_APP,
|
||||
database = MYSQL_DB_APP
|
||||
)
|
||||
|
||||
# 创建一个cursor对象来执行SQL语句
|
||||
cursor_app = conn_app.cursor(buffered=True)
|
||||
print(f'目录黑名单为:{parent_table_pages}')
|
||||
db_service.delete_to_run(conn,cursor,file_id)
|
||||
db_service.insert_table_measure_from_vector_async_process(cursor,parent_table_pages,file_id,file_path)
|
||||
|
||||
# #指标归一化处理
|
||||
db_service.update_ori_measure(conn,cursor,file_id)
|
||||
#db_service.delete_database(conn_app,cursor_app,file_id)
|
||||
cursor.close()
|
||||
conn.close()
|
||||
cursor_app.close()
|
||||
conn_app.close()
|
||||
|
||||
def merge_consecutive_arrays(pdf_info):
|
||||
merged_objects = []
|
||||
temp_array = {}
|
||||
|
||||
for info_obj in pdf_info:
|
||||
|
||||
try:
|
||||
if info_obj['type'] == 'table':
|
||||
# 如果对象是表格,将其元素添加到临时列表中
|
||||
if not temp_array.get('page_num'):
|
||||
temp_array = info_obj
|
||||
#else:
|
||||
# temp_array['data'].extend(info_obj['data'])
|
||||
elif len(temp_array['data'][0]) == len(info_obj['data'][0]):
|
||||
temp_array['data'].extend(info_obj['data'])
|
||||
else:
|
||||
if temp_array:
|
||||
# 将临时列表中的元素合并成一个数组,并添加到新的对象列表中
|
||||
merged_objects.append(temp_array)
|
||||
temp_array = {} # 重置临时列表
|
||||
else:
|
||||
# 如果对象不是表格,检查临时列表是否为空
|
||||
if temp_array:
|
||||
# 将临时列表中的元素合并成一个数组,并添加到新的对象列表中
|
||||
merged_objects.append(temp_array)
|
||||
temp_array = {} # 重置临时列表
|
||||
except Exception as e:
|
||||
#print(info_obj)
|
||||
print(f"解析数据错误: {e}")
|
||||
|
||||
if temp_array:
|
||||
merged_objects.append(temp_array)
|
||||
|
||||
return merged_objects
|
||||
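# Merge sketch (hypothetical fragments): consecutive 'table' objects with the same
# first-row width are treated as one table split across pages and concatenated;
# any non-table object flushes the buffer.
# pdf_info = [
#     {'type': 'table', 'page_num': 5, 'data': [['营业收入', '100']]},
#     {'type': 'table', 'page_num': 6, 'data': [['净利润', '80']]},
#     {'type': 'text', 'content': '...'},
#     {'type': 'table', 'page_num': 7, 'data': [['a', 'b', 'c']]},
# ]
# merge_consecutive_arrays(pdf_info)
# -> [{'type': 'table', 'page_num': 5, 'data': [['营业收入', '100'], ['净利润', '80']]},
#     {'type': 'table', 'page_num': 7, 'data': [['a', 'b', 'c']]}]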
|
||||
def merge_consecutive_arrays_v1(pdf_info):
|
||||
merged_objects = []
|
||||
temp_array = {}
|
||||
|
||||
def is_same_dimension(data1, data2):
|
||||
# 检查两个表的每行长度是否相同
|
||||
if len(data1) != len(data2):
|
||||
return False
|
||||
return all(len(row1) == len(row2) for row1, row2 in zip(data1, data2))
|
||||
|
||||
for info_obj in pdf_info:
|
||||
try:
|
||||
if info_obj['type'] == 'table':
|
||||
if not temp_array:
|
||||
# 如果临时列表为空,则初始化临时列表
|
||||
temp_array = info_obj
|
||||
else:
|
||||
# 检查当前表与临时列表中的表是否同维度
|
||||
if is_same_dimension(temp_array['data'], info_obj['data']):
|
||||
# 如果是同维度,则合并数据
|
||||
temp_array['data'].extend(info_obj['data'])
|
||||
else:
|
||||
# 如果不是同维度,将现有临时列表添加到结果中,并重置临时列表
|
||||
merged_objects.append(temp_array)
|
||||
temp_array = info_obj
|
||||
else:
|
||||
# 如果对象不是表格,检查临时列表是否非空
|
||||
if temp_array:
|
||||
# 将临时列表中的元素合并成一个数组,并添加到新的对象列表中
|
||||
merged_objects.append(temp_array)
|
||||
temp_array = {} # 重置临时列表
|
||||
except Exception as e:
|
||||
print(f"解析数据错误: {e}")
|
||||
|
||||
# 循环结束后,检查临时列表是否非空,如果非空,则添加到结果中
|
||||
if temp_array:
|
||||
merged_objects.append(temp_array)
|
||||
|
||||
return merged_objects
|
||||
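# Shape check used by v1 (illustrative only; is_same_dimension is local to the function):
# is_same_dimension([['a', 'b'], ['c', 'd']], [['e', 'f'], ['g', 'h']])  # -> True
# is_same_dimension([['a', 'b']], [['e', 'f'], ['g', 'h']])              # -> False (row counts differ)
# Unlike merge_consecutive_arrays, which only compares the width of the first row,
# v1 merges fragments only when the whole shape matches.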
def start_table_measure_job(file_id):
|
||||
conn_app = mysql.connector.connect(
|
||||
host = MYSQL_HOST_APP,
|
||||
user = MYSQL_USER_APP,
|
||||
password = MYSQL_PASSWORD_APP,
|
||||
database = MYSQL_DB_APP
|
||||
)
|
||||
|
||||
# 创建一个cursor对象来执行SQL语句
|
||||
cursor_app = conn_app.cursor(buffered=True)
|
||||
|
||||
select_process_query = '''
|
||||
select content from pdf_parse_process WHERE file_id = '{file_id}' and type='parse_table'
|
||||
'''.format(file_id=file_id)
|
||||
cursor_app.execute(select_process_query)
|
||||
records = cursor_app.fetchall()
|
||||
pdf_info = []
|
||||
for record in records:
|
||||
pdf_info.append(eval(record[0]))
|
||||
|
||||
sorted_pdf_info = sorted(pdf_info, key=lambda k: k['sort_num'])
|
||||
pdf_tables = merge_consecutive_arrays(sorted_pdf_info)
|
||||
|
||||
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
|
||||
redis_client.set(f'measure_count_{file_id}', len(pdf_tables))
|
||||
|
||||
cursor_app.close()
|
||||
conn_app.close()
|
||||
redis_client.close()
|
||||
|
||||
records_range_parts = utils.get_range(len(pdf_tables),MEASURE_COUNT)
|
||||
print(f'records_range_part识别页码的值为{records_range_parts}')
|
||||
processes = []
|
||||
for record_range in records_range_parts:
|
||||
p = Process(target=get_table_measure, args=(file_id,pdf_tables,record_range,))
|
||||
processes.append(p)
|
||||
p.start()
|
||||
|
||||
for p in processes:
|
||||
p.join()
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
file_id = '1778'
|
||||
page_num = 11
|
||||
conn = mysql.connector.connect(
|
||||
host = MYSQL_HOST,
|
||||
user = MYSQL_USER,
|
||||
password = MYSQL_PASSWORD,
|
||||
database = MYSQL_DB
|
||||
)
|
||||
|
||||
# 创建一个cursor对象来执行SQL语句
|
||||
cursor = conn.cursor(buffered=True)
|
||||
|
||||
select_process_query = '''
|
||||
select content from pdf_parse_process WHERE file_id = '{file_id}' and type='parse_table'
|
||||
and page_num in(41,42,43)
|
||||
'''.format(file_id=file_id, page_num=page_num)
|
||||
cursor.execute(select_process_query)
|
||||
records = cursor.fetchall()
|
||||
pdf_info = []
|
||||
for record in records:
|
||||
pdf_info.append(eval(record[0]))
|
||||
|
||||
sorted_pdf_info = sorted(pdf_info, key=lambda k: k['sort_num'])
|
||||
pdf_tables = merge_consecutive_arrays(sorted_pdf_info)
|
||||
|
||||
get_table_measure(file_id,pdf_tables,'0-2')
|
|
@ -0,0 +1,269 @@
|
|||
from docx import Document
|
||||
import json
|
||||
from docx.oxml.table import CT_Tbl
|
||||
from docx.oxml.text.paragraph import CT_P
|
||||
from lxml import etree
|
||||
import os
|
||||
import zipfile
|
||||
|
||||
RESULT_TYPE_TEXT = 'text'
|
||||
RESULT_TYPE_TABLE = 'table'
|
||||
|
||||
def build_result(result_type, index, data):
|
||||
return {
|
||||
'type': result_type,
|
||||
'index': index,
|
||||
'data': data
|
||||
}
|
||||
|
||||
def build_catalog_result(index, depth, data):
|
||||
return {
|
||||
'index': index,
|
||||
'depth': depth,
|
||||
'data': data
|
||||
}
|
||||
|
||||
# 解析docx文件中的XML内容
|
||||
def get_xml_content(docx_filename, xml_filename):
|
||||
with zipfile.ZipFile(docx_filename) as z:
|
||||
return z.read(xml_filename)
|
||||
|
||||
def parse_paragraph(paragraph, index, namespaces):
|
||||
paragraph_text = paragraph.text.strip() if paragraph else ''
|
||||
if paragraph_text:
|
||||
return build_result(RESULT_TYPE_TEXT, index, paragraph_text)
|
||||
return None
|
||||
|
||||
def parse_table(table, index):
|
||||
table_data = []
|
||||
for row in table.rows:
|
||||
row_data = [cell.text for cell in row.cells]
|
||||
table_data.append(row_data)
|
||||
return build_result(RESULT_TYPE_TABLE, index, table_data)
|
||||
|
||||
def parse_paragraph_element(paragraph_element, index, namespaces):
|
||||
paragraph_xml = etree.fromstring(paragraph_element.xml)
|
||||
paragraph_text = ''.join(paragraph_xml.xpath('//w:t/text()', namespaces=namespaces)).strip()
|
||||
if paragraph_text:
|
||||
return build_result(RESULT_TYPE_TEXT, index, paragraph_text)
|
||||
return None
|
||||
|
||||
def parse_table_element(table_element, index, namespaces):
|
||||
table_xml = etree.fromstring(table_element.xml)
|
||||
table_data = []
|
||||
for row in table_xml.xpath('//w:tr', namespaces=namespaces):
|
||||
row_data = []
|
||||
for cell in row.xpath('./w:tc | ./w:sdt', namespaces=namespaces):
|
||||
cell_text = ''.join(cell.xpath('.//w:t/text()', namespaces=namespaces)).strip()
|
||||
grid_span_xpath = etree.XPath('w:tcPr/w:gridSpan/@w:val', namespaces=namespaces)
|
||||
grid_span = int(grid_span_xpath(cell)[0]) if grid_span_xpath(cell) else 1
|
||||
if grid_span > 1:
|
||||
row_data.extend([cell_text] * grid_span)
|
||||
else:
|
||||
row_data.append(cell_text)
|
||||
table_data.append(row_data)
|
||||
return build_result(RESULT_TYPE_TABLE, index, table_data)
|
||||
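# gridSpan sketch: a cell spanning several grid columns is repeated that many times so
# merged header cells stay aligned with the data rows below, e.g. a header cell
# '分季度数据' with w:gridSpan="3" becomes ['分季度数据', '分季度数据', '分季度数据'].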
|
||||
def add_to_catalog(element_xml, index, catalog_content, namespaces, paragraph_text, heading_styles):
|
||||
p_element = etree.fromstring(element_xml)
|
||||
# outlineLvl = p_element.xpath('.//w:outlineLvl', namespaces=namespaces)
|
||||
# if outlineLvl:
|
||||
# level = int(outlineLvl[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'))
|
||||
# catalog_content.append(build_catalog_result(index, level, paragraph_text))
|
||||
level = is_heading_paragraph(p_element, heading_styles, namespaces)
|
||||
if level != -1:
|
||||
catalog_content.append(build_catalog_result(index, level, paragraph_text))
|
||||
# 检查段落是否为标题样式
|
||||
def is_heading_paragraph(paragraph, heading_styles, namespaces):
|
||||
pPr = paragraph.find('.//w:pPr', namespaces=namespaces)
|
||||
if pPr is not None:
|
||||
pStyle = pPr.find('.//w:pStyle', namespaces=namespaces)
|
||||
pOutLineLvl = pPr.find('.//w:outlineLvl', namespaces=namespaces)
|
||||
if pStyle is not None:
|
||||
style_val = pStyle.get(f"{{{namespaces['w']}}}val")
|
||||
if style_val.isdigit():
|
||||
return int(style_val)
|
||||
if pOutLineLvl is not None:
|
||||
outLineLvl_val = pOutLineLvl.get(f"{{{namespaces['w']}}}val")
|
||||
if outLineLvl_val.isdigit():
|
||||
return int(outLineLvl_val) + 1
|
||||
# if pStyle is not None and pStyle.get(ns['w'] + 'val') in heading_styles:
|
||||
# if style_val > 0:
|
||||
# return True
|
||||
return -1
|
||||
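# Heading-level sketch: a digit w:pStyle val is used directly as the level, otherwise
# w:outlineLvl (0-based) is promoted by 1; -1 means "not a heading".
# <w:pStyle w:val="2"/>      -> 2
# <w:outlineLvl w:val="0"/>  -> 1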
|
||||
def get_paragraph_text(paragraph_element, namespaces):
|
||||
paragraph_text = ''
|
||||
for run in paragraph_element.findall('.//w:r', namespaces=namespaces):
|
||||
for text in run.findall('.//w:t', namespaces=namespaces):
|
||||
paragraph_text += text.text if text.text is not None else ''
|
||||
return paragraph_text
|
||||
|
||||
def add_to_catalog_paragraph(text, index, catalog_content, namespaces):
|
||||
# 添加段落到目录
|
||||
catalog_content.append(build_catalog_result(index, 1, text)) # 假设默认级别为1
|
||||
|
||||
def parse_sdt_catalog(sdt_element, catalog_content, index, namespaces):
|
||||
sdt_content = sdt_element.find('.//w:sdtContent', namespaces=namespaces)
|
||||
if sdt_content is not None:
|
||||
for child in sdt_content:
|
||||
if child.tag.endswith('p'): # 内容控件中的段落
|
||||
paragraph_text = get_paragraph_text(child, namespaces)
|
||||
if paragraph_text.strip(): # 检查文本是否为空
|
||||
add_to_catalog_paragraph(paragraph_text, index, catalog_content, namespaces)
|
||||
index += 1 # 更新索引
|
||||
elif child.tag.endswith('tbl'): # 内容控件中的表格
|
||||
# 处理表格内容(如果需要)
|
||||
pass
|
||||
elif child.tag.endswith('sdt'): # 嵌套的内容控件
|
||||
index = parse_sdt_catalog(child, catalog_content, index, namespaces) # 递归解析嵌套的内容控件
|
||||
return index
|
||||
|
||||
def parse_docx(docx_path):
|
||||
try:
|
||||
document = Document(docx_path)
|
||||
styles_xml = get_xml_content(docx_path, 'word/styles.xml')
|
||||
except Exception as e:
|
||||
print(f"Error loading document: {e}")
|
||||
return None, None
|
||||
|
||||
doc_content = [] # 内容(文本+表格)
|
||||
catalog_content = [] # 目录
|
||||
current_index = 1 # 维护全局的 index 变量
|
||||
paragraph_index = 0
|
||||
table_index = 0
|
||||
# 获取整个文档的XML内容
|
||||
xml_root = document.part.element
|
||||
namespaces = xml_root.nsmap
|
||||
|
||||
# 获取所有标题样式
|
||||
styles_root = etree.fromstring(styles_xml)
|
||||
heading_styles = set()
|
||||
for style in styles_root.xpath('//w:style', namespaces=namespaces):
|
||||
# lxml exposes namespaced attributes in Clark notation ({uri}name), the same
# f"{{{namespaces['w']}}}val" pattern used elsewhere in this file
style_type = style.get(f"{{{namespaces['w']}}}type")
|
||||
style_id = style.get(f"{{{namespaces['w']}}}styleId", '')
if style_type == 'paragraph' and style_id.startswith('Heading'):
|
||||
heading_styles.add(style_id)
|
||||
|
||||
# 遍历文档中的所有元素
|
||||
for i, element in enumerate(document.element.body):
|
||||
if isinstance(element, CT_P): # 段落
|
||||
paragraph_result = parse_paragraph_element(element, current_index, namespaces)
|
||||
if paragraph_result:
|
||||
doc_content.append(paragraph_result)
|
||||
# 判断是否为目录,是就插入目录内容
|
||||
paragraph = document.paragraphs[paragraph_index]
|
||||
add_to_catalog(paragraph._element.xml, current_index, catalog_content, namespaces, paragraph.text, heading_styles)
|
||||
current_index += 1 # 更新 index
|
||||
paragraph_index += 1
|
||||
elif isinstance(element, CT_Tbl): # 表格
|
||||
table_result = parse_table_element(element, current_index, namespaces)
|
||||
if table_result:
|
||||
doc_content.append(table_result)
|
||||
current_index += 1 # 更新 index
|
||||
table_index += 1
|
||||
elif element.tag.endswith('sdt'): # 内容控件
|
||||
current_index = parse_sdt(element, doc_content, current_index, namespaces, catalog_content, heading_styles) # 更新索引
|
||||
|
||||
return json.dumps(doc_content, indent=4, ensure_ascii=False), json.dumps(catalog_content, indent=4, ensure_ascii=False)
|
||||
|
||||
|
||||
|
||||
def parse_sdt(sdt_element, doc_content, current_index, namespaces, catalog_content, heading_styles):
|
||||
sdtContent = sdt_element.find('.//w:sdtContent', namespaces=namespaces)
|
||||
if sdtContent is not None:
|
||||
for child in sdtContent:
|
||||
if child.tag.endswith('p'): # 内容控件中的段落
|
||||
paragraph_text = ''
|
||||
for run in child.findall('.//w:r', namespaces=namespaces):
|
||||
for text in run.findall('.//w:t', namespaces=namespaces):
|
||||
paragraph_text += text.text if text.text is not None else ''
|
||||
if paragraph_text.strip(): # 检查文本是否为空
|
||||
doc_content.append(build_result(RESULT_TYPE_TEXT, current_index, paragraph_text.strip()))
|
||||
# 判断是否为目录,是就插入目录内容
|
||||
add_to_catalog(child.xml, current_index, catalog_content, namespaces, paragraph_text, heading_styles)
|
||||
current_index += 1 # 更新索引
|
||||
elif child.tag.endswith('tbl'): # 内容控件中的表格
|
||||
table_data = []
|
||||
merged_cells = {} # 用于记录跨行单元格的信息
|
||||
for row_idx, row in enumerate(child.findall('.//w:tr', namespaces=namespaces)):
|
||||
row_data = []
|
||||
for col_idx, cell in enumerate(row.findall('.//w:tc', namespaces=namespaces)):
|
||||
cell_text = ''
|
||||
for run in cell.findall('.//w:r', namespaces=namespaces):
|
||||
for text in run.findall('.//w:t', namespaces=namespaces):
|
||||
cell_text += text.text if text.text is not None else ''
|
||||
|
||||
# 检查单元格是否跨列
|
||||
grid_span_xpath = etree.XPath('w:tcPr/w:gridSpan/@w:val', namespaces=namespaces)
|
||||
grid_span = int(grid_span_xpath(cell)[0]) if grid_span_xpath(cell) else 1
|
||||
if grid_span > 1:
|
||||
row_data.extend([cell_text.strip()] * grid_span)
|
||||
else:
|
||||
row_data.append(cell_text.strip())
|
||||
|
||||
# 检查单元格是否跨行
|
||||
v_merge_xpath = etree.XPath('w:tcPr/w:vMerge/@w:val', namespaces=namespaces)
|
||||
v_merge = v_merge_xpath(cell)
|
||||
if v_merge and v_merge[0] == 'restart':
|
||||
merged_cells[(row_idx, col_idx)] = (int(grid_span), 1)
|
||||
elif v_merge and v_merge[0] == 'continue':
|
||||
if (row_idx - 1, col_idx) in merged_cells:
|
||||
merged_cells[(row_idx - 1, col_idx)] = (merged_cells[(row_idx - 1, col_idx)][0], merged_cells[(row_idx - 1, col_idx)][1] + 1)
|
||||
# 跨行单元格不需要再次添加到 row_data 中
|
||||
else:
|
||||
# 只有非跨行单元格才需要添加到 row_data 中
|
||||
pass
|
||||
|
||||
# 处理跨行单元格
|
||||
for (r, c), (col_span, row_span) in list(merged_cells.items()):
|
||||
if r < row_idx:
|
||||
for i in range(row_span):
|
||||
if r + i == row_idx:
|
||||
row_data[c:c] = [row_data[c]] * (col_span - 1)
|
||||
break
|
||||
if r + row_span - 1 == row_idx:
|
||||
del merged_cells[(r, c)]
|
||||
|
||||
table_data.append(row_data)
|
||||
if table_data: # 检查表格数据是否为空
|
||||
doc_content.append(build_result(RESULT_TYPE_TABLE, current_index, table_data))
|
||||
current_index += 1 # 更新索引
|
||||
elif child.tag.endswith('sdt'): # 嵌套的内容控件
|
||||
current_index = parse_sdt(child, doc_content, current_index, namespaces, catalog_content, heading_styles) # 递归解析嵌套的内容控件
|
||||
return current_index # 返回更新后的索引
|
||||
|
||||
def split_text_table(json_data):
|
||||
# 分组
|
||||
text_elements = [element for element in json_data if element['type'] == 'text']
|
||||
table_elements = [element for element in json_data if element['type'] == 'table']
|
||||
|
||||
# 转换为JSON字符串
|
||||
text_elements_json = json.dumps(text_elements, ensure_ascii=False, indent=4)
|
||||
table_elements_json = json.dumps(table_elements, ensure_ascii=False, indent=4)
|
||||
|
||||
return text_elements_json, table_elements_json
|
||||
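# Example (hypothetical parsed content):
# data = [{'type': 'text', 'index': 1, 'data': '公司简介'},
#         {'type': 'table', 'index': 2, 'data': [['营业收入', '100']]}]
# texts_json, tables_json = split_text_table(data)  # two JSON strings, one per type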
|
||||
def append_to_file(file_path, text):
|
||||
try:
|
||||
with open(file_path, 'a', encoding='utf-8') as file:
|
||||
file.write(text + '\n')
|
||||
except Exception as e:
|
||||
print(f"Error writing to file: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
current_directory = os.getcwd()
|
||||
docx_relative_path = 'file/docx/101.docx'
|
||||
file_relative_path = 'file/docx/test1.txt'
|
||||
docx_path = os.path.join(current_directory, docx_relative_path)
|
||||
file_path = os.path.join(current_directory, file_relative_path)
|
||||
try:
|
||||
parsed_content, catalog_content = parse_docx(docx_path)
|
||||
if parsed_content and catalog_content:
|
||||
json_parsed_content = json.loads(parsed_content)
|
||||
text_elements_json, table_elements_json = split_text_table(json_parsed_content)
|
||||
|
||||
append_to_file(file_path, text_elements_json)
|
||||
append_to_file(file_path, table_elements_json)
|
||||
append_to_file(file_path, catalog_content)
|
||||
except Exception as e:
|
||||
print(f"Error parse_docx: {e}")
|
|
@ -0,0 +1,108 @@
|
|||
from config import MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB
|
||||
import mysql.connector
|
||||
from http import HTTPStatus
|
||||
import dashscope
|
||||
import random,re
|
||||
from pdfminer.high_level import extract_pages
|
||||
from pdfminer.layout import LTTextBoxHorizontal
|
||||
|
||||
dashscope.api_key='sk-63c02fbb9b7d4b0494a3200bec1ae286'
|
||||
|
||||
def get_company_name(file_path):
|
||||
line_text = ''
|
||||
# 我们从PDF中提取页面,page_numbers=[4,5,6]
|
||||
for pagenum, page in enumerate(extract_pages(file_path)):
|
||||
if pagenum > 1:
|
||||
break
|
||||
# 找到所有的元素
|
||||
page_elements = [(element.y1, element) for element in page._objs]
|
||||
# 查找组成页面的元素
|
||||
for i,component in enumerate(page_elements):
|
||||
# 提取页面布局的元素
|
||||
element = component[1]
|
||||
# 检查该元素是否为文本元素
|
||||
if isinstance(element, LTTextBoxHorizontal):
|
||||
# 检查文本是否出现在表中
|
||||
line_text += element.get_text()
|
||||
|
||||
return llm_service(line_text)
|
||||
|
||||
def llm_service(user_prompt):
|
||||
|
||||
system_prompt = '''
|
||||
从以下数据报告中提取公司全称,只需要提取中文公司全称,不要增加其他内容,如果提取不到公司全称,请返回-。
|
||||
<数据报告>
|
||||
<user_prompt>
|
||||
</数据报告>
|
||||
'''
|
||||
system_prompt = system_prompt.replace('<user_prompt>', user_prompt)
|
||||
response = dashscope.Generation.call(
|
||||
model='qwen-plus',
|
||||
prompt = system_prompt,
|
||||
seed=random.randint(1, 10000),
|
||||
top_p=0.8,
|
||||
result_format='message',
|
||||
enable_search=False,
|
||||
max_tokens=1500,
|
||||
temperature=0.85,
|
||||
repetition_penalty=1.0
|
||||
)
|
||||
if response.status_code == HTTPStatus.OK:
|
||||
result = response['output']['choices'][0]['message']['content']
|
||||
return result
|
||||
else:
|
||||
print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
|
||||
response.request_id, response.status_code,
|
||||
response.code, response.message
|
||||
))
|
||||
|
||||
return "llm_error"
|
||||
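# Usage sketch (requires a valid dashscope API key; the model's output varies):
# get_company_name('/usr/local/zhanglei/financial/xxx.pdf')
# -> e.g. 'XX股份有限公司', '-' when no name is found, or 'llm_error' on API failure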
|
||||
def update_company_name(file_id, company_name, cursor, conn):
|
||||
update_sql = f'''
|
||||
UPDATE report_check
|
||||
SET c_name = '{company_name}'
|
||||
WHERE id = {file_id}
|
||||
'''
|
||||
cursor.execute(update_sql)
|
||||
conn.commit()
|
||||
|
||||
if __name__ == '__main__':
|
||||
conn = mysql.connector.connect(
|
||||
host = MYSQL_HOST,
|
||||
user = MYSQL_USER,
|
||||
password = MYSQL_PASSWORD,
|
||||
database = MYSQL_DB
|
||||
)
|
||||
|
||||
# 创建一个cursor对象来执行SQL语句
|
||||
cursor = conn.cursor()
|
||||
|
||||
data_query = '''
|
||||
SELECT id,file_path FROM report_check where c_name is null
|
||||
'''
|
||||
|
||||
cursor.execute(data_query)
|
||||
data_list = cursor.fetchall()
|
||||
|
||||
for data in data_list:
|
||||
try:
|
||||
file_id = data[0]
|
||||
file_path = f'/usr/local/zhanglei/financial/{data[1]}'
|
||||
print(f'财报{file_id}开始解析')
|
||||
# file_id = '1329'
|
||||
# file_path = '/Users/zhengfei/Desktop/cb/zhangjun-600271-2023-nb-nb.pdf'
|
||||
|
||||
company_name = get_company_name(file_path)
|
||||
contains_newline = '\n' in company_name
|
||||
if contains_newline:
|
||||
lines = company_name.splitlines(True)
|
||||
company_name = lines[0]
|
||||
|
||||
if company_name != "llm_error":
|
||||
update_company_name(file_id, company_name, cursor, conn)
|
||||
except Exception as e:
|
||||
print(f'财报{file_id}解析失败',e)
|
||||
|
||||
cursor.close()
|
||||
conn.close()
|
|
@ -0,0 +1,240 @@
|
|||
import PyPDF2
|
||||
import re
|
||||
import os,threading
|
||||
from config import REDIS_HOST,REDIS_PORT,REDIS_PASSWORD
|
||||
import redis
|
||||
import db_service
|
||||
def get_tree_pages(root, info, depth=0,title_array=[]):
|
||||
"""
|
||||
Recursively iterate the outline tree
|
||||
Find the pages pointed to by the outline item
|
||||
and get the assigned physical order id
|
||||
|
||||
Decrement with padding if necessary
|
||||
"""
|
||||
|
||||
if isinstance(root, dict):
|
||||
# print(root)
|
||||
page = root['/Page'].get_object()
|
||||
# print(id(page))
|
||||
t = root['/Title']
|
||||
title = t
|
||||
if isinstance(t, PyPDF2.generic.ByteStringObject):
|
||||
title = t.original_bytes.decode('utf8')
|
||||
title = title.strip()
|
||||
title = title.replace('\n', '')
|
||||
title = title.replace('\r', '')
|
||||
|
||||
page_num = info['all_pages'].get(id(page), 0)
|
||||
if page_num == 0:
|
||||
print('Not found page number for /Page!', page)
|
||||
elif page_num < info['padding']:
|
||||
page_num = 0
|
||||
else:
|
||||
page_num -= info['padding']
|
||||
|
||||
|
||||
# str_val = '%-5d' % page_num
|
||||
# str_val += '\t' * depth
|
||||
# str_val += title + '\t' + '%3d' % page_num
|
||||
# print(str_val)
|
||||
title_array.append({
|
||||
'title': title,
|
||||
'page_num': page_num,
|
||||
'depth': depth
|
||||
})
|
||||
for elem in root:
|
||||
get_tree_pages(elem, info, depth+1,title_array)
|
||||
return title_array
|
||||
|
||||
|
||||
def recursive_numbering(obj, info):
|
||||
"""
|
||||
Recursively iterate through all the pages in order and assign them a physical
|
||||
order number
|
||||
"""
|
||||
# print(id(obj), obj)
|
||||
if obj['/Type'] == '/Page':
|
||||
obj_id = id(obj)
|
||||
if obj_id not in info['all_pages']:
|
||||
info['all_pages'][obj_id] = info['current_page_id']
|
||||
info['current_page_id'] += 1
|
||||
return
|
||||
elif obj['/Type'] == '/Pages':
|
||||
for page in obj['/Kids']:
|
||||
recursive_numbering(page.get_object(), info)
|
||||
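# The /Pages tree is walked depth-first, so info['all_pages'] ends up mapping
# id(page_object) -> 1-based physical position; get_tree_pages resolves outline
# targets through the same id(). Sketch (reader is an assumed PyPDF2.PdfReader):
# info = {'all_pages': {}, 'current_page_id': 1, 'padding': 0}
# recursive_numbering(reader.trailer['/Root']['/Pages'].get_object(), info)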
|
||||
def get_numbers_between(numbers_between,start, end):
|
||||
# 初始化一个空列表来存储两个数字之间的所有数字
|
||||
|
||||
# 遍历从开始数字到结束数字之间的每个数字
|
||||
for i in range(start, end + 1):
|
||||
# 将每个数字添加到列表中
|
||||
numbers_between.append(i)
|
||||
return numbers_between
|
||||
|
||||
def get_page_end(start, depth, title_array):
|
||||
page_end = -1
|
||||
for i in range(start, len(title_array)):
|
||||
if title_array[i]['depth'] == depth:
|
||||
page_end = title_array[i]['page_num']
|
||||
break
|
||||
return page_end
|
||||
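# get_page_end scans forward from index `start` for the next title at the same
# depth and returns its page number (hypothetical outline):
# titles = [{'depth': 1, 'page_num': 10}, {'depth': 2, 'page_num': 12}, {'depth': 1, 'page_num': 30}]
# get_page_end(1, 1, titles)  # -> 30 (the deeper level-2 entry is skipped)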
|
||||
def get_file_split(page_count):
|
||||
# 获取 CPU 核数
|
||||
cpu_count = os.cpu_count()
|
||||
if page_count < cpu_count:
|
||||
cpu_count = page_count
|
||||
# 使用 divmod() 函数计算除法结果和余数
|
||||
quotient, remainder = divmod(page_count, cpu_count)
|
||||
table_split_parts = []
|
||||
text_split_parts = []
|
||||
for i in range(cpu_count):
|
||||
start_num = i * quotient
|
||||
if i < cpu_count-1:
|
||||
start_num = i * quotient
|
||||
end_num = start_num+quotient
|
||||
else:
|
||||
end_num = page_count
|
||||
table_split_parts.append(f'{start_num}-{end_num}')
|
||||
text_split_parts.append(get_numbers_between([],start_num, end_num))
|
||||
|
||||
# 返回除法结果和余数
|
||||
return {
|
||||
'table_split_parts': table_split_parts,
|
||||
'text_split_parts': text_split_parts
|
||||
}
|
||||
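# Worked example on a 4-core machine (os.cpu_count() == 4) with page_count = 10:
# divmod(10, 4) -> (2, 2); the last part absorbs the remainder:
# table_split_parts -> ['0-2', '2-4', '4-6', '6-10']
# text_split_parts  -> [[0, 1, 2], [2, 3, 4], [4, 5, 6], [6, 7, 8, 9, 10]]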
|
||||
def create_text_outline(pdf_path, file_id):
|
||||
# print('Running the script for [%s] with padding [%d]' % (pdf_path, page_number_padding))
|
||||
# creating an object
|
||||
with open(pdf_path, 'rb') as file:
|
||||
file_info = {}
|
||||
fileReader = PyPDF2.PdfReader(file)
|
||||
page_count = len(fileReader.pages)
|
||||
|
||||
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
|
||||
redis_client.set(f'page_count_{file_id}', page_count)
|
||||
|
||||
info = {
|
||||
'page_count': page_count,
|
||||
'all_pages': {},
|
||||
'current_page_id': 1,
|
||||
'padding': 0
|
||||
}
|
||||
|
||||
print('Number of pages: %d' % info['page_count'])
|
||||
|
||||
pages = fileReader.trailer['/Root']['/Pages'].get_object()
|
||||
recursive_numbering(pages, info)
|
||||
#for page_num, page in enumerate(pages['/Kids']):
|
||||
# page_obj = page.getObject()
|
||||
# all_pages[id(page_obj)] = page_num + 1 # who starts counting from 0 anyways?
|
||||
title_array = get_tree_pages(fileReader.outline, info, 0, [])
|
||||
db_service.pdf_title_insert_mysql(file_id,title_array)
|
||||
title_array = db_service.get_file_info_from_mysql(file_id)
|
||||
|
||||
parent_table_pages_local = {}
|
||||
parent_table_pages_local[file_id] = []
|
||||
print(f'{file_id}:{len(title_array)}')
|
||||
for i in range(len(title_array)):
|
||||
title_obj = title_array[i]
|
||||
title = title_obj['title']
|
||||
#print(f'标题分别是{title}')
|
||||
if len(re.findall('母公司|现金流量表补充|重要会计政策|会计估计变更|公允价值的披露|合营安排或联营企业中的权益|与金融工具相关的风险|税项|主要控股参股公司|结构化主体情况|公司股份总数及股东结构变动及公司资产和负债结构的变动情况|所有权或使用权受到限制的资产|在建工程|固定资产|其他主体中的权益|分部信息|与金融工具相关的风险|其他关联交易|公司子公司重大事项', title)) >0 :
|
||||
page_start = title_obj['page_num']
|
||||
depth = title_obj['depth']
|
||||
if i < len(title_array) - 1:
|
||||
page_end = title_array[i+1]['page_num']
|
||||
if title_array[i]['depth'] in [1,2]:
|
||||
page_end = get_page_end(i+1, depth, title_array)
|
||||
else:
|
||||
page_end = page_count
|
||||
print(f'目录识别时被丢弃的页码:{page_start}-{page_end}')
|
||||
|
||||
#当标题为母公司财务报表主要项目注释时,最后一页不过滤,避免核心roe指标无法召回
|
||||
if len(re.findall('财务报表主要项目注释', title)) == 0:
|
||||
page_end = page_end - 1
|
||||
# print(title,page_start,page_end)
|
||||
for i in range(page_start, page_end + 1):
|
||||
# 将每个数字添加到列表中
|
||||
parent_table_pages_local[file_id].append(i)
|
||||
file_info['page_count'] = page_count
|
||||
file_info['parent_table_pages'] = parent_table_pages_local[file_id]
|
||||
file_info['split_parts'] = get_file_split(page_count)
|
||||
|
||||
redis_client.close()
|
||||
|
||||
return file_info
|
||||
|
||||
|
||||
def create_text_outline_disclosure(pdf_path, file_id):
|
||||
# print('Running the script for [%s] with padding [%d]' % (pdf_path, page_number_padding))
|
||||
# creating an object
|
||||
with open(pdf_path, 'rb') as file:
|
||||
file_info = {}
|
||||
fileReader = PyPDF2.PdfReader(file)
|
||||
page_count = len(fileReader.pages)
|
||||
|
||||
redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
|
||||
redis_client.set(f'page_count_{file_id}', page_count)
|
||||
|
||||
info = {
|
||||
'page_count': page_count,
|
||||
'all_pages': {},
|
||||
'current_page_id': 1,
|
||||
'padding': 0
|
||||
}
|
||||
|
||||
print('Number of pages: %d' % info['page_count'])
|
||||
|
||||
pages = fileReader.trailer['/Root']['/Pages'].get_object()
|
||||
recursive_numbering(pages, info)
|
||||
#for page_num, page in enumerate(pages['/Kids']):
|
||||
# page_obj = page.getObject()
|
||||
# all_pages[id(page_obj)] = page_num + 1 # who starts counting from 0 anyways?
|
||||
title_array = get_tree_pages(fileReader.outline, info, 0, [])
|
||||
#db_service.pdf_title_insert_mysql(file_id,title_array)
|
||||
#title_array = db_service.get_file_info_from_mysql(file_id)
|
||||
|
||||
parent_table_pages_local = {}
|
||||
parent_table_pages_local[file_id] = []
|
||||
print(f'{file_id}:{len(title_array)}')
|
||||
for i in range(len(title_array)):
|
||||
title_obj = title_array[i]
|
||||
title = title_obj['title']
|
||||
#print(f'标题分别是{title}')
|
||||
if len(re.findall('母公司|现金流量表补充|重要会计政策|会计估计变更|公允价值的披露|合营安排或联营企业中的权益|与金融工具相关的风险|税项|主要控股参股公司|结构化主体情况|公司股份总数及股东结构变动及公司资产和负债结构的变动情况|所有权或使用权受到限制的资产|在建工程|固定资产|其他主体中的权益|分部信息|与金融工具相关的风险|其他关联交易|公司子公司重大事项', title)) >0 :
|
||||
page_start = title_obj['page_num']
|
||||
depth = title_obj['depth']
|
||||
if i < len(title_array) - 1:
|
||||
page_end = title_array[i+1]['page_num']
|
||||
if title_array[i]['depth'] in [1,2]:
|
||||
page_end = get_page_end(i+1, depth, title_array)
|
||||
else:
|
||||
page_end = page_count
|
||||
print(f'目录识别时被丢弃的页码:{page_start}-{page_end}')
|
||||
|
||||
#当标题为母公司财务报表主要项目注释时,最后一页不过滤,避免核心roe指标无法召回
|
||||
if len(re.findall('财务报表主要项目注释', title)) == 0:
|
||||
page_end = page_end - 1
|
||||
# print(title,page_start,page_end)
|
||||
for i in range(page_start, page_end + 1):
|
||||
# 将每个数字添加到列表中
|
||||
parent_table_pages_local[file_id].append(i)
|
||||
file_info['page_count'] = page_count
|
||||
file_info['parent_table_pages'] = parent_table_pages_local[file_id]
|
||||
file_info['split_parts'] = get_file_split(page_count)
|
||||
|
||||
redis_client.close()
|
||||
|
||||
return file_info
|
||||
if __name__ == '__main__':
|
||||
import time
|
||||
path = "/Users/zhengfei/Desktop/cb/2023年报检测/安妮股份.pdf"
|
||||
|
||||
threading.Thread(target=create_text_outline, args=(path,'111')).start()
|
||||
time.sleep(5)
|
||||
threading.Thread(target=create_text_outline, args=(path,'222')).start()
|
||||
|
|
@ -0,0 +1,72 @@
|
|||
#报错提示
|
||||
import paramiko
|
||||
import time
|
||||
import threading
|
||||
|
||||
# 执行命令的函数
|
||||
def execute_commands_on_server(hostname, username, password, host):
|
||||
try:
|
||||
# 连接到服务器
|
||||
client = paramiko.SSHClient()
|
||||
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
client.connect(hostname=hostname, username=username, password=password)
|
||||
|
||||
# 执行命令
|
||||
shell = client.invoke_shell()
|
||||
# restart app_word.py: kill any running instance, then relaunch it with nohup
|
||||
shell.send("cd /root/pdf_parser/zzb_data_prod\n")
|
||||
time.sleep(1)
|
||||
shell.send("conda activate py310\n")
|
||||
time.sleep(1)
|
||||
shell.send("ps -ef | grep app_word.py | grep -v grep | awk '{print $2}' | xargs -r kill -9\n")
|
||||
time.sleep(1)
|
||||
shell.send("nohup python app_word.py > app.log 2>&1 &\n")
|
||||
time.sleep(1)
|
||||
# 读取输出
|
||||
output = shell.recv(2048).decode()
|
||||
print(f"Output from {hostname}:\n{output}")
|
||||
|
||||
except paramiko.SSHException as e:
|
||||
print(f"SSH connection error with {hostname}: {e}")
|
||||
|
||||
finally:
|
||||
client.close()
|
||||
|
||||
# 创建线程函数
|
||||
def thread_function(server):
|
||||
execute_commands_on_server(server['hostname'], server['username'], server['password'], server['host'])
|
||||
|
||||
servers = [
|
||||
{'hostname': '124.71.149.225', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '企业服务器1'},
|
||||
{'hostname': '1.94.143.23', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '企业服务器2'},
|
||||
{'hostname': '1.94.60.103', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '企业服务器3'},
|
||||
{'hostname': '124.71.157.162', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '企业服务器4'},
|
||||
{'hostname': '123.60.16.225', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '企业服务器5'},
|
||||
{'hostname': '1.94.101.237', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '企业服务器6'},
|
||||
|
||||
{'hostname': '113.44.72.157', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '监管服务器1'},
|
||||
{'hostname': '113.44.52.221', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '监管服务器2'},
|
||||
{'hostname': '121.37.137.13', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '监管服务器3'},
|
||||
|
||||
{'hostname': '1.94.106.10', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '新增服务器1'},
|
||||
{'hostname': '1.94.182.142', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '新增服务器2'},
|
||||
{'hostname': '119.3.153.192', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '新增服务器3'},
|
||||
|
||||
# {'hostname': '192.168.0.13', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'测试服务器2'},
|
||||
# {'hostname': '192.168.0.103', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'测试服务器3'},
|
||||
#
|
||||
]
|
||||
|
||||
# 创建并启动线程
|
||||
threads = []
|
||||
for server in servers:
|
||||
thread = threading.Thread(target=thread_function, args=(server,))
|
||||
threads.append(thread)
|
||||
thread.start()
|
||||
|
||||
# 等待所有线程完成
|
||||
for thread in threads:
|
||||
thread.join()
|
||||
|
||||
print("All commands executed.")
|
||||
|
|
@ -0,0 +1,67 @@
|
|||
#报错提示
|
||||
import paramiko
|
||||
import time
|
||||
import threading
|
||||
|
||||
# 执行命令的函数
|
||||
def execute_commands_on_server(hostname, username, password, host):
|
||||
try:
|
||||
# 连接到服务器
|
||||
client = paramiko.SSHClient()
|
||||
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
client.connect(hostname=hostname, username=username, password=password)
|
||||
|
||||
# 执行命令
|
||||
shell = client.invoke_shell()
|
||||
# restart app.py: kill any running instance, then relaunch it with nohup
|
||||
shell.send("cd /root/pdf_parser/zzb_data_prod\n")
|
||||
time.sleep(1)
|
||||
shell.send("conda activate py310\n")
|
||||
time.sleep(1)
|
||||
shell.send("ps -ef | grep app.py | grep -v grep | awk '{print $2}' | xargs -r kill -9\n")
|
||||
time.sleep(1)
|
||||
shell.send("nohup python app.py > app.log 2>&1 &\n")
|
||||
time.sleep(1)
|
||||
# 读取输出
|
||||
output = shell.recv(2048).decode()
|
||||
print(f"Output from {hostname}:\n{output}")
|
||||
|
||||
except paramiko.SSHException as e:
|
||||
print(f"SSH connection error with {hostname}: {e}")
|
||||
|
||||
finally:
|
||||
client.close()
|
||||
|
||||
# 创建线程函数
|
||||
def thread_function(server):
|
||||
execute_commands_on_server(server['hostname'], server['username'], server['password'], server['host'])
|
||||
|
||||
servers = [
|
||||
{'hostname': '192.168.0.163', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器1'},
|
||||
{'hostname': '192.168.0.26', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器2'},
|
||||
{'hostname': '192.168.0.2', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器3'},
|
||||
{'hostname': '192.168.0.128', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器4'},
|
||||
{'hostname': '192.168.0.136', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器5'},
|
||||
{'hostname': '192.168.0.239', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器6'},
|
||||
{'hostname': '192.168.0.108', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'监管服务器1'},
|
||||
{'hostname': '192.168.0.131', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'监管服务器2'},
|
||||
{'hostname': '192.168.0.205', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'监管服务器3'},
|
||||
|
||||
# {'hostname': '192.168.0.13', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'测试服务器2'},
|
||||
# {'hostname': '192.168.0.103', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'测试服务器3'},
|
||||
#
|
||||
]
|
||||
|
||||
# 创建并启动线程
|
||||
threads = []
|
||||
for server in servers:
|
||||
thread = threading.Thread(target=thread_function, args=(server,))
|
||||
threads.append(thread)
|
||||
thread.start()
|
||||
|
||||
# 等待所有线程完成
|
||||
for thread in threads:
|
||||
thread.join()
|
||||
|
||||
print("All commands executed.")
|
||||
|
|
@ -0,0 +1,81 @@
|
|||
#报错提示
|
||||
import paramiko
|
||||
import time
|
||||
import threading
|
||||
|
||||
# 执行命令的函数
|
||||
def execute_commands_on_server(hostname, username, password, host):
|
||||
try:
|
||||
# 连接到服务器
|
||||
client = paramiko.SSHClient()
|
||||
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
client.connect(hostname=hostname, username=username, password=password)
|
||||
|
||||
# 执行命令
|
||||
shell = client.invoke_shell()
|
||||
# restart app_word.py: kill any running instance, then relaunch it with nohup
|
||||
shell.send("cd /root/pdf_parser/zzb_data_word\n")
|
||||
time.sleep(1)
|
||||
shell.send("conda activate py310\n")
|
||||
time.sleep(1)
|
||||
shell.send("ps -ef | grep app_word.py | grep -v grep | awk '{print $2}' | xargs -r kill -9\n")
|
||||
time.sleep(1)
|
||||
shell.send("nohup python app_word.py > app_word.log 2>&1 &\n")
|
||||
time.sleep(1)
|
||||
# 读取输出
|
||||
output = shell.recv(2048).decode()
|
||||
print(f"Output from {hostname}:\n{output}")
|
||||
|
||||
except paramiko.SSHException as e:
|
||||
print(f"SSH connection error with {hostname}: {e}")
|
||||
|
||||
finally:
|
||||
client.close()
|
||||
|
||||
# 创建线程函数
|
||||
def thread_function(server):
|
||||
execute_commands_on_server(server['hostname'], server['username'], server['password'], server['host'])
|
||||
|
||||
servers = [
|
||||
# {'hostname': '192.168.0.163', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器1'},
|
||||
# {'hostname': '192.168.0.26', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器2'},
|
||||
# {'hostname': '192.168.0.2', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器3'},
|
||||
# {'hostname': '192.168.0.128', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器4'},
|
||||
# {'hostname': '192.168.0.136', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器5'},
|
||||
# {'hostname': '192.168.0.239', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器6'},
|
||||
# {'hostname': '192.168.0.108', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'监管服务器1'},
|
||||
# {'hostname': '192.168.0.131', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'监管服务器2'},
|
||||
#{'hostname': '192.168.0.205', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'监管服务器3'},
|
||||
|
||||
# {'hostname': '192.168.0.13', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'测试服务器2'},
|
||||
# {'hostname': '192.168.0.103', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'测试服务器3'},
|
||||
|
||||
{'hostname': '124.71.149.225', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器1'},
|
||||
{'hostname': '1.94.143.23', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器2'},
|
||||
{'hostname': '1.94.60.103', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器3'},
|
||||
{'hostname': '124.71.157.162', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器4'},
|
||||
{'hostname': '123.60.16.225', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器5'},
|
||||
{'hostname': '1.94.101.237', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'企业服务器6'},
|
||||
|
||||
{'hostname': '113.44.72.157', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'监管服务器1'},
|
||||
{'hostname': '113.44.52.221', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'监管服务器2'},
|
||||
{'hostname': '121.37.137.13', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'监管服务器3'},
|
||||
|
||||
{'hostname': '1.94.106.10', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '新增服务器1'},
|
||||
{'hostname': '1.94.182.142', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '新增服务器2'},
|
||||
{'hostname': '119.3.153.192', 'username': 'root', 'password': 's6fQeVQmxxNv', 'host': '新增服务器3'},
|
||||
]
|
||||
|
||||
# 创建并启动线程
|
||||
threads = []
|
||||
for server in servers:
|
||||
thread = threading.Thread(target=thread_function, args=(server,))
|
||||
threads.append(thread)
|
||||
thread.start()
|
||||
|
||||
# 等待所有线程完成
|
||||
for thread in threads:
|
||||
thread.join()
|
||||
|
||||
print("All commands executed.")
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Set the file list and destination directory. Note: the config file must NOT be transferred here. /root/pdf_parser/zzb_data_prod/utils.py /root/pdf_parser/zzb_data_prod/db_service.py
|
||||
#FILES="/root/project/zzb_data_word/redis_service.py /root/project/zzb_data_word/zzb_logger.py /root/project/zzb_data_word/parse_word.py /root/project/zzb_data_word/config.py /root/project/zzb_data_word/utils.py /root/project/zzb_data_word/db_service_word.py /root/project/zzb_data_word/app_word.py /root/project/zzb_data_word/main_word.py /root/project/zzb_data_word/word_title.py"
|
||||
FILES="/root/project/zzb_data_word/parse_word.py"
|
||||
DEST_PATH="/root/pdf_parser/zzb_data_word"
|
||||
|
||||
# 设置服务器列表 主服务器 "1.94.143.23" "113.44.72.157" "1.94.101.237" "123.60.16.225" "124.71.157.162" "1.94.60.103" "1.94.143.23" "124.71.149.225" "113.44.52.221" "121.37.137.13"
|
||||
#SERVERS=("113.44.72.157" "1.94.101.237" "123.60.16.225" "124.71.157.162" "1.94.60.103" "124.71.149.225" "113.44.52.221" "121.37.137.13" "123.60.28.83" "192.168.0.19" "192.168.0.53" "192.168.0.150" "192.168.0.210" "192.168.0.129" "192.168.0.24" "192.168.0.250" "192.168.0.162" "192.168.0.86" "192.168.0.88" "192.168.0.93" "192.168.0.228" "192.168.0.155" "192.168.0.186" "192.168.0.56" "192.168.0.185" "192.168.0.72" "192.168.0.35" "192.168.0.230" "192.168.0.125" "192.168.0.46" "192.168.0.131")
|
||||
#SERVERS=("192.168.0.228" "192.168.0.155" "192.168.0.186" "192.168.0.56" "192.168.0.185")
|
||||
#监管服务器
|
||||
#SERVERS=("192.168.0.108" "192.168.0.131")
|
||||
#企业服务器
|
||||
#SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239")
|
||||
#两者一起
|
||||
#SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239" "192.168.0.108" "192.168.0.131")
|
||||
#测试
|
||||
#SERVERS=("192.168.0.103" "192.168.0.13")
|
||||
#1013生产(企业+监管)
|
||||
#SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239" "192.168.0.108" "192.168.0.131" "192.168.0.205")
|
||||
# 生产更新
|
||||
SERVERS=("124.71.149.225" "1.94.143.23" "1.94.60.103" "124.71.157.162" "123.60.16.225" "1.94.101.237" "113.44.72.157" "113.44.52.221" "121.37.137.13")
|
||||
# 遍历每个服务器并上传文件
|
||||
for SERVER in "${SERVERS[@]}"; do
|
||||
echo "Uploading files to $SERVER"
|
||||
scp -r $FILES root@$SERVER:$DEST_PATH
|
||||
echo "Finished uploading to $SERVER"
|
||||
done
|
||||
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Set the file list and destination directory. Note: the config file must NOT be transferred here. /root/pdf_parser/zzb_data_prod/utils.py /root/pdf_parser/zzb_data_prod/db_service.py
|
||||
#FILES="/root/project/zzb_data_word/redis_service.py /root/project/zzb_data_word/zzb_logger.py /root/project/zzb_data_word/parse_word.py /root/project/zzb_data_word/config.py /root/project/zzb_data_word/utils.py /root/project/zzb_data_word/db_service_word.py /root/project/zzb_data_word/app_word.py /root/project/zzb_data_word/main_word.py /root/project/zzb_data_word/word_title.py"
|
||||
FILES="/root/project/zzb_data_prod/pdf_company.py"
|
||||
DEST_PATH="/root/pdf_parser/zzb_data_prod"
|
||||
|
||||
# 设置服务器列表 主服务器 "1.94.143.23" "113.44.72.157" "1.94.101.237" "123.60.16.225" "124.71.157.162" "1.94.60.103" "1.94.143.23" "124.71.149.225" "113.44.52.221" "121.37.137.13"
|
||||
#SERVERS=("113.44.72.157" "1.94.101.237" "123.60.16.225" "124.71.157.162" "1.94.60.103" "124.71.149.225" "113.44.52.221" "121.37.137.13" "123.60.28.83" "192.168.0.19" "192.168.0.53" "192.168.0.150" "192.168.0.210" "192.168.0.129" "192.168.0.24" "192.168.0.250" "192.168.0.162" "192.168.0.86" "192.168.0.88" "192.168.0.93" "192.168.0.228" "192.168.0.155" "192.168.0.186" "192.168.0.56" "192.168.0.185" "192.168.0.72" "192.168.0.35" "192.168.0.230" "192.168.0.125" "192.168.0.46" "192.168.0.131")
|
||||
#SERVERS=("192.168.0.228" "192.168.0.155" "192.168.0.186" "192.168.0.56" "192.168.0.185")
|
||||
#监管服务器
|
||||
#SERVERS=("192.168.0.108" "192.168.0.131")
|
||||
#企业服务器
|
||||
#SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239")
|
||||
#两者一起
|
||||
#SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239" "192.168.0.108" "192.168.0.131")
|
||||
#测试
|
||||
#SERVERS=("192.168.0.103" "192.168.0.13")
|
||||
#1013生产(企业+监管)
|
||||
#SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239" "192.168.0.108" "192.168.0.131" "192.168.0.205")
|
||||
# 生产更新
|
||||
SERVERS=("124.71.149.225" "1.94.143.23" "1.94.60.103" "124.71.157.162" "123.60.16.225" "1.94.101.237" "113.44.72.157" "113.44.52.221" "121.37.137.13" "1.94.106.10" "1.94.182.142" "119.3.153.192")
|
||||
# 遍历每个服务器并上传文件
|
||||
for SERVER in "${SERVERS[@]}"; do
|
||||
echo "Uploading files to $SERVER"
|
||||
scp -r $FILES root@$SERVER:$DEST_PATH
|
||||
echo "Finished uploading to $SERVER"
|
||||
done
|
||||
|
||||
|
|
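The two deploy scripts above are identical apart from FILES, DEST_PATH and the SERVERS array. Since a sequential scp loop grows slow past a handful of hosts, here is a minimal Python sketch of the same fan-out done concurrently; paths and hosts are illustrative, and it assumes the same key-based root SSH access the shell scripts rely on:

# Sketch only: concurrent scp fan-out with a thread pool.
import subprocess
from concurrent.futures import ThreadPoolExecutor

FILES = ["/root/project/zzb_data_word/parse_word.py"]  # illustrative
DEST_PATH = "/root/pdf_parser/zzb_data_word"           # illustrative
SERVERS = ["124.71.149.225", "1.94.143.23"]            # illustrative subset

def upload(server):
    print(f"Uploading files to {server}")
    subprocess.run(["scp", "-r", *FILES, f"root@{server}:{DEST_PATH}"], check=True)
    print(f"Finished uploading to {server}")

with ThreadPoolExecutor(max_workers=4) as pool:
    list(pool.map(upload, SERVERS))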
@@ -0,0 +1,260 @@
#coding=utf-8
import sys
# from pdfminer.high_level import extract_text
# from pdfminer.pdfparser import PDFParser
# from pdfminer.pdfdocument import PDFDocument
# from pdfminer.pdfpage import PDFPage
import utils
import mysql.connector
# from pymilvus import connections,MilvusClient
import json,time
# import db_service
import ast
import numpy as np
import config_p
import redis_service
from config_p import MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB,REDIS_HOST,REDIS_PORT,REDIS_PASSWORD
# import main
import redis


def run_job(sec):
    time.sleep(sec)


def measure_config_to_db(conn,cursor):
    insert_query = '''
        INSERT INTO measure_config_half_year
        (measure_id, measure_name, ori_measure_id, ori_measure_name,year)
        VALUES (%s, %s, %s, %s, %s)
    '''
    # Open the text file
    with open('measure_config_all.txt', 'r',encoding='utf-8') as file:
        # Read all lines into a list
        lines = file.readlines()

    # Insert one row per line
    for line in lines:
        config_list = line.strip().split(',')
        measure = config_list[0]
        ori_measure = config_list[1]
        ori_measure_id = utils.get_md5(ori_measure)

        data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure, '2024')
        cursor.execute(insert_query, data_to_insert)
    conn.commit()


def insert_measure_vector(conn,cursor):

    # redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
    redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=6)
    # Fetch the configured measures; note that the second query below overrides the first
    select_query = '''
        SELECT ori_measure_id,ori_measure_name FROM measure_config_half_year where year='2024'
    '''
    select_query = '''
        SELECT ori_measure_id,ori_measure_name FROM measure_config where year='2023'
    '''
    cursor.execute(select_query)
    records = cursor.fetchall()
    for record in records:
        if redis_client.hexists('measure_config', record[0]):
            measure_vector = redis_client.hget('measure_config', record[0])
        else:
            print('新增指标',record[1])
            vector_obj = utils.embed_with_str(record[1])
            measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])

        redis_client.hset('measure_config', record[0], measure_vector)
    redis_client.close()
    conn.close()


# def contains_financial_indicators(text):
#     import re
#     # Regex patterns matching thousands-separated numbers and percentages
#     pattern = r"\d{1,3}(,\d{3})+(\.\d{1,3})?"

#     pattern1 = r"\d+(.\d+)+%?"
#     # Look for a match with re.search
#     match = re.search(pattern1, text)

#     # Return True if a match was found, otherwise False
#     return bool(match)

# def get_clean_text(text):
#     import re
#     pattern = r"\([^)]*?\)"
#     matches = re.findall(pattern, text)
#     for match in matches:
#         # Check whether the bracketed content mentions a month or one of the keywords
#         month_keywords_found = re.search(r"归属于|扣非", match)
#         if not month_keywords_found:
#             # If not, strip that bracketed part from the text
#             text = re.sub(pattern,"", text)
#         else:
#             # Otherwise remove all punctuation and Chinese numerals
#             text = re.sub(r"[^\w\s]", "", text)
#     print(text)

# def insert_and_update(conn,cursor,client,parent_table_pages,file_id,path):
#     # Look up measures via their vectors
#     db_service.insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_id,path)

#     # Normalise the measures
#     db_service.update_ori_measure(conn,cursor,file_id)

# def print_measure_data(cursor,client):
#     select_query = '''
#         SELECT ori_measure_name,measure_name,ori_measure_id FROM measure_config
#         where measure_id not in(select distinct measure_id from ori_measure_list where file_id='64')
#     '''
#     cursor.execute(select_query)
#     records = cursor.fetchall()
#     for record in records:
#         ori_measure_name = record[0]
#         measure_name = record[1]
#         ori_measure_id = record[2]
#         measure_vector = redis_service.read_from_redis(ori_measure_id)

#         measure_list = ast.literal_eval(measure_vector)
#         data = [measure_list]
#         res = client.search(
#             collection_name="pdf_measure_v4", # Replace with the actual name of your collection
#             # Replace with your query vector
#             data=data,
#             limit=2, # Max. number of search results to return
#             search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
#             output_fields=["measure_name","measure_value","table_num","table_index"],
#             filter = 'file_id == "64"'
#         )
#         vector_str = measure_name+":"+ori_measure_name
#         # Convert the output to a formatted JSON string
#         for i in range(len(res[0])):

#             vector_distance = float(res[0][i]["distance"])
#             vector_measure_name = res[0][i]["entity"]["measure_name"]
#             measure_value = res[0][i]["entity"]["measure_value"]
#             table_num = res[0][i]["entity"]["table_num"]
#             table_index = res[0][i]["entity"]["table_index"]
#             table_num_list = [106]
#             print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index))
#             # if vector_distance > 0.89 and table_num not in table_num_list:
#             #     print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(0.94))
#             # if vector_distance > distance and table_num not in table_num_list:
#             #     print(vector_str +":"+vector_measure_name +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(vector_distance)+":"+str(distance))


if __name__ == "__main__":
    # redis_client = redis.Redis(host='123.60.153.169', port=6379, password='Xgf_redis', db=6)
    # vector = redis_service.read_from_redis(redis_client,'893301b0e4f1e07d16b4830fcdaea28a')
    # print(vector)
    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB
    )
    cursor = conn.cursor()

    # measure_config_to_db(conn,cursor)

    insert_measure_vector(conn,cursor)

    # cursor.close()
    # conn.close()
    # import re
    # text = '减少11.04百分点'
    # if re.match(r'(增加|减少)[了]?(\d+\.\d+)[个]?百分点', text):
    #     print('找到了单位。')

    # unit_pattern = re.compile(r'(增加|减少)[了]?(\d+\.\d+)[个]?百分点')

    # match = unit_pattern.search(text)
    # print(len(match.groups()))

    # if match:
    #     print(f'找到单位。')
    # else:
    #     print(f'没有找到单位。')
    # row1 = ['比例','比率','占比','费用']
    # row2 = ['同比增减','同比上升','同比下降','变化幅度','变动比例','本期比上年同期增减','本年比上年增减','同比变动','本期期末金额较上期期末变动比例']

    # for i in range(len(row1)):
    #     for j in range(len(row2)):
    #         print(f"{row1[i]}{row2[j]}")
    # import os,re
    # file_path = '/projects/ai_chat/knowledge_base/ydkf/content/体育运动处方及应用_13925781.docx'

    # # Split the file name and extension
    # file_base_name, file_extension = os.path.splitext(os.path.basename(file_path))
    # file_base_name = file_base_name.replace("_", "").replace("\d+", "")
    # file_base_name = re.sub(r'\d+', '', file_base_name)
    # print(f'文件名: {file_base_name}')
    # import re
    # print(len(re.findall('母公司|现金流量表补充', '补充资料')))
    # import threading

    # # Create a ThreadLocal variable
    # local_data = threading.local()

    # # Worker function executed by each thread
    # def worker():
    #     # Give the current thread's ThreadLocal variable a value
    #     local_data.data = f"Thread {threading.current_thread().name}'s data"
    #     print(local_data.data)

    # # Create and start several threads
    # threads = []
    # for i in range(3):
    #     thread = threading.Thread(target=worker)
    #     thread.start()
    #     threads.append(thread)

    # # Wait for all threads to finish
    # for thread in threads:
    #     thread.join()
    # for i in range(2,5):
    #     print(i)
    # file_url = 'http://static.cninfo.com.cn/finalpage/2023-04-11/1216368607.PDF'
    # file_path = utils.save_pdf_from_url(file_url, config.FILE_PATH)
    # redis_client = redis.Redis(host='123.60.153.169', port=6379, password='Xgf_redis', db=6)
    # print(redis_client.hget('measure_config', '2805fd5b7bfa960eb08312fa3d7c08'))
    # client = MilvusClient(
    #     uri= MILVUS_CLIENT
    # )
    # conn = mysql.connector.connect(
    #     host=MYSQL_HOST,
    #     user=MYSQL_USER,
    #     password=MYSQL_PASSWORD,
    #     database=MYSQL_DB
    # )
    # cursor = conn.cursor()
    # print_measure_data(cursor,client)
    # redis_service.read_from_file_and_write_to_redis(conn,cursor)
    # redis_service.read_from_redis()
    # parent_table_pages = []
    # file_id = '67'
    # path = '/Users/zhengfei/Desktop/上汽车配/上汽车配_1.pdf'

    # db_service.insert_table_measure_from_vector_test(conn,cursor,client,parent_table_pages,file_id,path)

    # db_service.update_ori_measure(conn,cursor,file_id)

    # main.get_table_measure(path,'all',file_id)

    # insert_and_update(conn,cursor,client,parent_table_pages,file_id,path)


    # measure_config_to_db(conn,cursor)
    # params = ['f_102','f_103',]
    # for param in params:
    #     globals()[param] = param.replace('f_','')
    # # insert_measure_vector(conn,cursor)
    # print(globals()['f_102'])
    # db_service.update_ori_measure(conn,cursor,file_id)

    # conn.commit()
    # cursor.close()
    # conn.close()
    # # print(utils.get_md5('当期营业收入,2023年营业收入'))
    # count_range_parts = utils.get_range(2300)

    # print(count_range_parts)
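insert_measure_vector caches one embedding per ori_measure_id in the Redis hash measure_config, stored as a stringified Python list. A hedged sketch of the consuming side, mirroring the commented-out print_measure_data above; the Redis host, hash field and Milvus URI are placeholders:

# Sketch only: read a cached embedding back and feed it to a Milvus search.
import ast
import redis
from pymilvus import MilvusClient

r = redis.Redis(host='127.0.0.1', port=6379, db=6)       # placeholder host
raw = r.hget('measure_config', 'some_ori_measure_id')    # hypothetical id
vector = ast.literal_eval(raw.decode())                  # "[0.1, ...]" -> list[float]

client = MilvusClient(uri='http://127.0.0.1:19530')      # placeholder uri
res = client.search(
    collection_name="pdf_measure_v4",
    data=[vector],
    limit=2,
    search_params={"metric_type": "COSINE", "params": {}},
    output_fields=["measure_name", "measure_value"],
)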
@@ -0,0 +1,198 @@
import pandas as pd
import mysql.connector
import utils
#from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
import redis_service
import redis

def process_excel_and_db(input_excel_path1, input_excel_path2, output_file_path):
    # Read the first Excel file
    df = pd.read_excel(input_excel_path1, sheet_name='Sheet7', header=0)  # corresponds to the ttt sheet
    # Convert the DataFrame into a list of dicts
    data_list = df.to_dict(orient='records')

    # Connect to the MySQL database
    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB
    )
    cursor = conn.cursor()

    # Insert the rows into the measure_create_config table
    insert_query = '''
        INSERT INTO measure_create_config
        (config_id, meta_measure, same_mean_measure, measure_period, change_type, black_list)
        VALUES (%s, %s, %s, %s, %s, %s)
    '''
    for data in data_list:
        show_measure = str(data['指标'])
        same_mean_measure = str(data['同义表述'])
        period_measure = str(data['周期'])
        change_measure = str(data['变动'])
        black_list = str(data['黑名单词'])
        config_id = utils.get_md5(show_measure)
        insert_query_data = (config_id, show_measure, same_mean_measure, period_measure, change_measure, black_list)
        cursor.execute(insert_query, insert_query_data)
    conn.commit()

    # Read the second Excel file
    df_period = pd.read_excel(input_excel_path2, sheet_name='Sheet11', header=0)  # corresponds to the period sheet
    # Convert the DataFrame into a list of dicts
    period_list = df_period.to_dict(orient='records')

    # Insert the rows into the measure_create_period table
    period_insert_query = '''
        INSERT INTO measure_create_period
        (period_name, same_mean_period)
        VALUES (%s, %s)
    '''
    for data in period_list:
        period_name = str(data['标准表述'])
        same_mean_period = str(data['同义表述'])
        insert_query_data = (period_name, same_mean_period)
        cursor.execute(period_insert_query, insert_query_data)
    conn.commit()

    # Query the database back
    data_query = '''
        SELECT * FROM measure_create_config WHERE delete_status = 0
    '''
    period_query = '''
        SELECT * FROM measure_create_period
    '''

    cursor.execute(data_query)
    data_list = cursor.fetchall()

    cursor.execute(period_query)
    period_list = cursor.fetchall()

    # Write the generated name combinations to the output file
    with open(output_file_path, 'w', encoding='utf-8') as file:
        for data in data_list:
            config_id = data[0]
            show_measure = data[1]
            same_mean_measure = data[2]
            period_measure = data[3]
            change_measure = data[4]
            same_mean_measure_arr = []
            period_measure_arr = []
            change_measure_arr = []

            if same_mean_measure != 'nan':
                same_mean_measure_arr = same_mean_measure.split(',')
                same_mean_measure_arr.append(show_measure)
            if period_measure != 'nan':
                period_measure_arr = period_measure.split(',')
            if change_measure != 'nan':
                change_measure_arr = change_measure.split(',')

            for c in change_measure_arr:
                period_measure_arr.append(c)

            for x in period_measure_arr:
                if x in change_measure_arr:
                    show_name = show_measure + x
                else:
                    show_name = x + show_measure
                for y in same_mean_measure_arr:
                    if x in change_measure:
                        parser_name = y + x
                    else:
                        parser_name = x + y

                    file.write(f'{show_name},{parser_name}\n')

                    for p in period_list:
                        period_exra_name = p[0]
                        period_exra_value = p[1]
                        if period_exra_name in x:
                            for v in period_exra_value.split(','):
                                if x in change_measure:
                                    parser_name = y + x.replace(period_exra_name, v)
                                else:
                                    parser_name = x.replace(period_exra_name, v) + y
                                file.write(f'{show_name},{parser_name}\n')

    cursor.close()
    conn.close()

def measure_config_to_db(conn, cursor, file_path):
    insert_query = '''
        INSERT INTO measure_config_third_quarter
        (measure_id, measure_name, ori_measure_id, ori_measure_name)
        VALUES (%s, %s, %s, %s)
    '''
    check_query = '''
        SELECT ori_measure_id FROM measure_config_third_quarter
    '''

    # Open the text file
    with open(file_path, 'r', encoding='utf-8') as file:
        # Read all lines into a list
        lines = file.readlines()

    # Insert one row per line
    for line in lines:
        config_list = line.strip().split(',')
        measure = config_list[0]
        ori_measure = config_list[1]
        ori_measure_id = utils.get_md5(ori_measure)

        # Check whether the row is already in the database
        cursor.execute(check_query)
        check_records = cursor.fetchall()
        #if any(record[0] == ori_measure_id for record in check_records):
        #    continue

        data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure)
        cursor.execute(insert_query, data_to_insert)
    conn.commit()

def insert_measure_vector(conn,cursor):

    redis_client = redis.Redis(host='192.168.0.172', port=6379, password='Xgf_redis', db=6)  # 192.168.0.172; test: 123.60.153.169
    # Fetch the configured measures
    select_query = '''
        SELECT ori_measure_id,ori_measure_name FROM measure_config_1024
    '''
    cursor.execute(select_query)
    records = cursor.fetchall()
    for record in records:
        if redis_client.hexists('measure_config', record[0]):
            measure_vector = redis_client.hget('measure_config', record[0])
        else:
            print('新增指标',record[1])
            vector_obj = utils.embed_with_str(record[1])
            measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])

        redis_client.hset('measure_config', record[0], measure_vector)
    redis_client.close()
    conn.close()

#from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
if __name__ == "__main__":
    MYSQL_HOST = '121.37.185.246'
    MYSQL_PORT = 3306
    MYSQL_USER = 'financial'
    MYSQL_PASSWORD = 'financial_8000'
    MYSQL_DB = 'financial_report'
    # Empty the local measure_create_config and measure_create_period tables before running

    process_excel_and_db(
        'ttt_1.xlsx',      # ttt file
        'period_1.xlsx',   # period file
        'out_2022_new_year.txt'  # output file
    )
    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB
    )
    cursor = conn.cursor()
    file_path = 'out_2022_new_year.txt'
    measure_config_to_db(conn, cursor, file_path)
    insert_measure_vector(conn,cursor)
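Every Excel cell above goes through str() first, which is why missing values are compared against the literal strings 'nan' and 'NaT'. If this were ever refactored, pandas' own null check is the more direct route; a minimal sketch (the helper name is ours, not the repo's):

# Sketch only: replace the str(value) == 'nan' pattern with pd.isna().
import pandas as pd

def cell(value, default=''):
    # NaN / NaT cells become the default instead of the strings 'nan'/'NaT'
    return default if pd.isna(value) else str(value)

# e.g. same_mean_measure = cell(data['同义表述'])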
@@ -0,0 +1,17 @@
import redis

# Write one measure vector into the Redis hash
def read_from_file_and_write_to_redis(redis_client,ori_measure_id,measure_vector):
    redis_client.hset('measure_config',ori_measure_id, measure_vector)

# Read one measure vector back from the Redis hash
def read_from_redis(redis_client,ori_measure_id):
    return redis_client.hget('measure_config',ori_measure_id).decode()

# if __name__ == "__main__":
#     # redis_client = redis.Redis(host='123.60.153.169', port=6379, password='Xgf_redis', db=6)
#     redis_client = redis.Redis(host='124.70.129.232', port=6379, password='Xgf_redis', db=6)
#
#     value = read_from_redis(redis_client,"92b44ffb50b6ab2068f5de447c9925")
#     print(value)
@@ -0,0 +1,82 @@
import redis
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def migrate_redis(source_host, source_port, source_password, target_host, target_port, target_password):
    try:
        # Connect to the source Redis
        source_redis = redis.StrictRedis(host=source_host, port=source_port, password=source_password,
                                         decode_responses=True)

        # Connect to the target Redis
        target_redis = redis.StrictRedis(host=target_host, port=target_port, password=target_password,
                                         decode_responses=True)

        # Number of databases in the source Redis
        db_count = int(source_redis.config_get('databases')['databases'])
        logging.info(f"Total databases in source Redis: {db_count}")

        # Iterate over every database
        for db in range(db_count):
            try:
                # Switch both connections to the current database
                source_redis.select(db)
                target_redis.select(db)
                logging.info(f"Migrating data from DB {db}")

                # Create a pipeline on the target
                pipeline = target_redis.pipeline()

                # Iterate over all keys in the current database
                for key in source_redis.scan_iter():
                    try:
                        key_type = source_redis.type(key)  # type of the key
                        logging.info(f"Migrating key: {key} (Type: {key_type}) in DB {db}")

                        # Copy the data according to the key type
                        if key_type == 'string':
                            value = source_redis.get(key)
                            pipeline.set(key, value)
                        elif key_type == 'hash':
                            hash_data = source_redis.hgetall(key)
                            pipeline.hset(key, mapping=hash_data)  # hset instead of the deprecated hmset
                        elif key_type == 'list':
                            list_data = source_redis.lrange(key, 0, -1)
                            pipeline.rpush(key, *list_data)
                        elif key_type == 'set':
                            set_data = source_redis.smembers(key)
                            pipeline.sadd(key, *set_data)
                        elif key_type == 'zset':
                            zset_data = source_redis.zrange(key, 0, -1, withscores=True)
                            for member, score in zset_data:
                                pipeline.zadd(key, {member: score})
                        else:
                            logging.warning(f"Unsupported key type: {key_type} for key: {key} in DB {db}")
                    except Exception as e:
                        logging.error(f"Failed to migrate key: {key} in DB {db}. Error: {e}")

                # Flush the pipeline in one batch
                pipeline.execute()
                logging.info(f"Migration completed for DB {db}")
            except Exception as e:
                logging.error(f"Failed to migrate DB {db}. Error: {e}")

        logging.info("All databases migrated successfully!")
    except Exception as e:
        logging.error(f"Migration failed. Error: {e}")


# Connection settings for the source and target Redis
source_host = '10.127.2.206'
source_port = 6379
source_password = "Xgf_redis"

target_host = '10.127.2.209'
target_port = 6379
target_password = "dMrt4kmwiW6LDJXy"

# Run the migration
migrate_redis(source_host, source_port, source_password, target_host, target_port, target_password)
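One caveat: the per-type copy above recreates values but not expirations, so any key with a TTL becomes persistent on the target. A minimal addition inside the key loop would carry the TTL over; placement here is illustrative:

# Sketch only: preserve the remaining TTL on the target key.
ttl_ms = source_redis.pttl(key)   # -1 means no TTL, -2 means the key vanished
if ttl_ms > 0:
    pipeline.pexpire(key, ttl_ms)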
@@ -0,0 +1,14 @@
camelot-py==0.11.0
pdfminer.six==20221105
PyPDF2==3.0.1
pdfplumber==0.10.3
pymilvus==2.3.3
mysql-connector-python==8.3.0
dashscope==1.17.0
fastapi
pydantic
uvicorn
redis
ghostscript
opencv-python-headless
python-docx
@@ -0,0 +1,63 @@
import pandas as pd
import json
import utils
import mysql.connector

conn = mysql.connector.connect(
    host = 'rm-bp1vns6jjy6yu46lhio.mysql.rds.aliyuncs.com',
    user = 'hematiyu',
    password = '00a09f971769499f8c0495505ab0922C',
    database = 'ai_chat_mgmt_test'
)

# Create a cursor to execute SQL statements
cursor = conn.cursor()

excel_file_path = '/Users/zhengfei/Desktop/healthy_book.xlsx'

# Read the Excel file
xls = pd.ExcelFile(excel_file_path)

# Iterate over each sheet
for sheet_name in xls.sheet_names:
    # Read the sheet contents
    df = pd.read_excel(xls, sheet_name, header=0)

    # Convert the DataFrame into a list of dicts
    data_list = df.to_dict(orient='records')

    insert_query = '''
        INSERT INTO ai_chat_book_info
        (name, publish, author, isbn, pub_time, word_flag, category, keywords)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    '''

    for data in data_list:
        name = str(data['书名'])
        if name == 'nan':
            continue
        publish = str(data['出版单位'])
        author = str(data['作者']).replace('[', '')
        isbn = str(data['ISBN'])
        pub_time = str(data['年份'])
        if pub_time == 'NaT':
            pub_time = ''
        else:
            # Keep only the first four characters (the year)
            pub_time = pub_time[:4]
        word_flag = str(data['是否转换为word格式'])
        if word_flag == 'nan':
            word_flag = ''
        category = str(data['分类'])
        if category == 'nan':
            category = ''
        keywords = str(data['关键词'])
        if keywords == 'nan':
            keywords = ''
        insert_query_data = (name, publish, author, isbn, pub_time, word_flag, category, keywords)
        cursor.execute(insert_query, insert_query_data)
    conn.commit()


cursor.close()
conn.close()
@@ -0,0 +1,156 @@
import pymssql
import mysql.connector
import logging
from multiprocessing import Pool

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# SQL Server settings
sql_server_config = {
    "server": "203.192.15.17",       # SQL Server IP address
    "port": 28063,                   # SQL Server port
    "user": "zncbuser",              # user name
    "password": "ZZB-Cbindex-data",  # password
    "database": "jydb",              # database name
}

# MySQL settings
mysql_config = {
    "host": "rm-bp1f85h3xs6mvnf5e3o.mysql.rds.aliyuncs.com",  # MySQL host
    "user": "zzb_jydb",              # user name
    "password": "Ysdbsdjs89Yrqwp",   # password
    "database": "zzb_jydb",          # database name
}

# Maximum number of worker processes
MAX_PROCESSES = 1

def sync_table_structure(table_name):
    try:
        # Connect to SQL Server
        sql_server_conn = pymssql.connect(
            server=sql_server_config["server"],
            port=sql_server_config["port"],
            user=sql_server_config["user"],
            password=sql_server_config["password"],
            database=sql_server_config["database"],
        )
        sql_server_cursor = sql_server_conn.cursor()

        # Connect to MySQL
        mysql_conn = mysql.connector.connect(**mysql_config)
        mysql_cursor = mysql_conn.cursor()

        logging.info(f"Processing table: {table_name}")

        # Check whether the table already exists in MySQL
        mysql_cursor.execute(f"SHOW TABLES LIKE '{table_name}'")
        table_exists = mysql_cursor.fetchone()

        if not table_exists:
            # The table does not exist yet: read its column definitions
            sql_server_cursor.execute(f"""
                SELECT
                    COLUMN_NAME,
                    DATA_TYPE,
                    CHARACTER_MAXIMUM_LENGTH,
                    NUMERIC_PRECISION,
                    NUMERIC_SCALE
                FROM INFORMATION_SCHEMA.COLUMNS
                WHERE TABLE_NAME = '{table_name}'
            """)
            columns = sql_server_cursor.fetchall()

            # Build the MySQL CREATE TABLE statement
            create_table_sql = f"CREATE TABLE {table_name} ("
            for col in columns:
                col_name = col[0]
                col_type = col[1]

                # Column length / precision
                char_length = col[2]
                numeric_precision = col[3]
                numeric_scale = col[4]

                # Simple type mapping (may need adjusting for real data)
                if col_type == "varchar":
                    col_type = "VARCHAR(255)"
                elif col_type == "int":
                    col_type = "INT"
                elif col_type == "datetime":
                    col_type = "DATETIME"
                elif col_type == "decimal":
                    if numeric_precision and numeric_scale:
                        col_type = f"DECIMAL({numeric_precision}, {numeric_scale})"
                    else:
                        col_type = "DECIMAL(10, 2)"  # default
                elif col_type == "money":
                    col_type = "DECIMAL(19, 4)"
                elif col_type == "smallmoney":
                    col_type = "DECIMAL(19, 4)"
                elif col_type == "image":
                    col_type = "LONGBLOB"

                # NULL-ability of the column
                if col_name.lower() == "id":
                    # The ID column must not be NULL
                    create_table_sql += f"`{col_name}` {col_type} NOT NULL, "
                else:
                    # All other columns may be NULL
                    create_table_sql += f"`{col_name}` {col_type} , "

            # Add the primary key constraint (assumes the first column is the key)
            create_table_sql = create_table_sql.rstrip(", ") + f", PRIMARY KEY ({columns[0][0]}))"
            logging.info(f"Create table SQL: {create_table_sql}")

            # Create the table in MySQL
            mysql_cursor.execute(create_table_sql)
            logging.info(f"Table {table_name} created in MySQL.")
        else:
            logging.info(f"Table {table_name} already exists in MySQL. Skipping...")

        # Close the connections
        sql_server_cursor.close()
        sql_server_conn.close()
        mysql_cursor.close()
        mysql_conn.close()

        logging.info(f"Sync completed for table: {table_name}")
    except Exception as e:
        logging.error(f"Failed to sync table {table_name}. Error: {e}")

def main():
    try:
        # Connect to SQL Server
        sql_server_conn = pymssql.connect(
            server=sql_server_config["server"],
            port=sql_server_config["port"],
            user=sql_server_config["user"],
            password=sql_server_config["password"],
            database=sql_server_config["database"],
        )
        sql_server_cursor = sql_server_conn.cursor()

        # List all base tables in SQL Server
        sql_server_cursor.execute("SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE' ORDER BY TABLE_NAME")
        tables = sql_server_cursor.fetchall()

        # Process the tables concurrently with a process pool
        with Pool(processes=MAX_PROCESSES) as pool:
            pool.map(sync_table_structure, [table[0] for table in tables])

        logging.info("All tables synced successfully!")
    except Exception as e:
        logging.error(f"Main function failed. Error: {e}")
    finally:
        # Close the connections
        if 'sql_server_cursor' in locals():
            sql_server_cursor.close()
        if 'sql_server_conn' in locals():
            sql_server_conn.close()

# Entry point
if __name__ == "__main__":
    main()
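The if/elif chain in sync_table_structure is the piece most likely to grow as more SQL Server types appear; expressing it as a lookup table keeps the mapping in one place. A sketch with the same mappings as above (not yet in the repo):

# Sketch only: the type mapping above, table-driven.
TYPE_MAP = {
    "varchar": "VARCHAR(255)",
    "int": "INT",
    "datetime": "DATETIME",
    "money": "DECIMAL(19, 4)",
    "smallmoney": "DECIMAL(19, 4)",
    "image": "LONGBLOB",
}

def map_type(col_type, precision=None, scale=None):
    if col_type == "decimal":
        # decimal keeps its declared precision/scale when available
        return f"DECIMAL({precision}, {scale})" if precision and scale else "DECIMAL(10, 2)"
    return TYPE_MAP.get(col_type, col_type.upper())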
@@ -0,0 +1,32 @@
Run task text (35857)...
{'file_id': '5555', 'unit': '万元', 'page_num': 5, 'table_index': 2}
{'file_id': '5555', 'unit': '万元', 'page_num': 6, 'table_index': 1}
{'file_id': '5555', 'unit': '万元', 'page_num': 6, 'table_index': 2}
{'file_id': '5555', 'unit': '万元', 'page_num': 8, 'table_index': 2}
Task text runs 25.38 seconds.
{'top': 143.97104000000002, 'buttom': 133.41104, 'page_num': 2, 'type': 'text', 'content': '□适用 √不适用 ', 'sort_num': 1856.02896}
{'top': 197.01104, 'buttom': 110.61103999999999, 'page_num': 4, 'type': 'text', 'content': '公司注册地址 公司注册地址的历史变更情况 公司办公地址 公司办公地址的邮政编码 公司网址 电子信箱 ', 'sort_num': 3802.98896}
{'top': 196.41104, 'buttom': 110.13104, 'page_num': 4, 'type': 'text', 'content': '北京市海淀区杏石口路甲18号航天信息园 无 北京市海淀区杏石口路甲18号航天信息园 100195 http://www.aisino.com stock@aisino.com ', 'sort_num': 3803.58896}
{'top': 555.1410400000001, 'buttom': 530.90104, 'page_num': 5, 'table_index': 2, 'type': 'page_footer', 'content': '公司聘请的会计师事务所(境内) ', 'sort_num': 4444.85896}
{'top': 503.42104, 'buttom': 475.15000000000003, 'page_num': 5, 'table_index': 2, 'type': 'page_footer', 'content': '七、 近三年主要会计数据和财务指标 (一) 主要会计数据 ', 'sort_num': 4496.57896}
{'top': 470.18104, 'buttom': 459.62104, 'page_num': 5, 'table_index': 2, 'type': 'page_footer', 'content': '单位:万元 币种:人民币 ', 'sort_num': 4529.81896}
{'top': 458.15999999999997, 'buttom': 273.36, 'page_num': 5, 'table_index': 2, 'type': 'table', 'data': [['主要会计数据', '2023年', '2022年', '本期比上年同期增减(%)', '2021年'], ['营业收入', '1257482.20', '1931407.52', '-34.89', '2351554.42'], ['归属于上市公司股东的净利润', '20271.53', '107841.40', '-81.20', '102357.97'], ['归属于上市公司股东的扣除非经常性损益的净利润', '2704.30', '94848.41', '-97.15', '134689.72'], ['经营活动产生的现金流量净额', '87498.48', '186388.43', '-53.06', '187453.86'], ['', '2023年末', '2022年末', '本期末比上年同期末增减(%)', '2021年末'], ['归属于上市公司股东的净资产', '1404971.01', '1414749.10', '-0.69', '1334971.44'], ['总资产', '2269076.18', '2370768.08', '-4.29', '2283286.86']], 'sort_num': 4541.84}
{'top': 241.68104000000002, 'buttom': 230.42408, 'page_num': 5, 'table_index': 3, 'type': 'page_footer', 'content': '(二) 主要财务指标 ', 'sort_num': 4758.31896}
{'top': 226.79999999999998, 'buttom': 86.88, 'page_num': 5, 'table_index': 3, 'type': 'table', 'data': [['主要财务指标', '2023年', '2022年', '本期比上年同期增减(%)', '2021年'], ['基本每股收益(元/股)', '0.11', '0.58', '-81.03', '0.55'], ['稀释每股收益(元/股)', '0.11', '0.58', '-81.03', '0.55'], ['扣除非经常性损益后的基本每股收益(元/股)', '0.01', '0.51', '-98.04', '0.73'], ['加权平均净资产收益率(%)', '1.45', '7.90', '下降6.45个百分点', '7.95'], ['扣除非经常性损益后的加权平均净资产收益率(%)', '0.19', '6.94', '下降6.75个百分点', '10.46']], 'sort_num': 4773.2}
{'top': 473.78103999999996, 'buttom': 446.66103999999996, 'page_num': 6, 'table_index': 1, 'type': 'page_footer', 'content': '(三) 境内外会计准则差异的说明: □适用 √不适用 ', 'sort_num': 5526.21896}
{'top': 426.98104, 'buttom': 415.6924, 'page_num': 6, 'table_index': 1, 'type': 'page_footer', 'content': '九、 2023 年分季度主要财务数据 ', 'sort_num': 5573.01896}
{'top': 410.30104, 'buttom': 399.74104, 'page_num': 6, 'table_index': 1, 'type': 'page_footer', 'content': '单位:万元 币种:人民币 ', 'sort_num': 5589.69896}
{'top': 398.4, 'buttom': 258.96, 'page_num': 6, 'table_index': 1, 'type': 'table', 'data': [['', '第一季度(1-3月份)', '第二季度(4-6月份)', '第三季度(7-9月份)', '第四季度(10-12月份)'], ['营业收入', '350592.33', '347900.54', '305553.10', '253436.23'], ['归属于上市公司股东的净利润', '24569.40', '28044.65', '-6452.14', '-25890.38'], ['归属于上市公司股东的扣除非经常性损益后的净利润', '-8313.07', '16084.23', '2564.63', '-7631.49'], ['经营活动产生的现金流量净额', '-116544.51', '51455.38', '-40594.82', '193182.43']], 'sort_num': 5601.6}
{'top': 243.84104000000002, 'buttom': 219.72104000000002, 'page_num': 6, 'table_index': 2, 'type': 'page_footer', 'content': '季度数据与已披露定期报告数据差异说明 □适用 √不适用 ', 'sort_num': 5756.15896}
{'top': 200.01104, 'buttom': 172.89104, 'page_num': 6, 'table_index': 2, 'type': 'page_footer', 'content': '十、 非经常性损益项目和金额 √适用 □不适用 ', 'sort_num': 5799.98896}
{'top': 169.77104, 'buttom': 159.21104, 'page_num': 6, 'table_index': 2, 'type': 'page_footer', 'content': '单位:万元 币种:人民币 ', 'sort_num': 5830.22896}
{'top': 157.92, 'buttom': 101.52, 'page_num': 6, 'table_index': 2, 'type': 'table', 'data': [['非经常性损益项目', '2023年金额', '附注(如适用)', '2022年金额', '2021年金额'], ['非流动性资产处置损益,包括已计提资产减值准备的冲销部分', '600.11', '-', '224.25', '814.45']], 'sort_num': 5842.08}
{'top': 765.8399999999999, 'buttom': 87.84, 'page_num': 7, 'table_index': 1, 'type': 'table', 'data': [['非经常性损益项目', '2023年金额', '附注(如适用)', '2022年金额', '2021年金额'], ['计入当期损益的政府补助,但与公司正常经营业务密切相关、符合国家政策规定、按照确定的标准享有、对公司损益产生持续影响的政府补助除外', '7847.97', '-', '12602.14', '12861.57'], ['除同公司正常经营业务相关的有效套期保值业务外,非金融企业持有金融资产和金融负债产生的公允价值变动损益以及处置金融资产和金融负债产生的损益', '18586.19', '主要是公司持有的中油资本股票处置收益以及公允价值变动损益。', '6233.14', '-40552.70'], ['计入当期损益的对非金融企业收取的资金占用费', '', '', '', ''], ['委托他人投资或管理资产的损益', '', '', '', ''], ['对外委托贷款取得的损益', '', '', '', ''], ['因不可抗力因素,如遭受自然灾害而产生的各项资产损失', '', '', '', ''], ['单独进行减值测试的应收款项减值准备转回', '1391.58', '', '', ''], ['企业取得子公司、联营企业及合营企业的投资成本小于取得投资时应享有被投资单位可辨认净资产公允价值产生的收益', '', '', '', ''], ['同一控制下企业合并产生的子公司期初至合并日的当期净损益', '', '', '', ''], ['非货币性资产交换损益', '', '', '', ''], ['债务重组损益', '', '', '', ''], ['企业因相关经营活动不再持续而发生的一次性费用,如安置职工的支出等', '', '', '', ''], ['因税收、会计等法律、法规的调整对当期损益产生的一次性影响', '', '', '', ''], ['因取消、修改股权激励计划一次性确认的股份支付费用', '', '', '', ''], ['对于现金结算的股份支付,在可行权日之后,应付职工的公允价值变动产生的损益', '', '', '', ''], ['采用公允价值模式进行后续计量的投资性房地产公允价值变动产生的损益', '', '', '', ''], ['交易价格显失公允的交易产生的收益', '', '', '', ''], ['与公司正常经营业务无关的或有事项产生的损益', '', '', '', ''], ['受托经营取得的托管费收入', '', '', '', '']], 'sort_num': 6234.16}
{'top': 765.8399999999999, 'buttom': 625.68, 'page_num': 8, 'table_index': 1, 'type': 'table', 'data': [['非经常性损益项目', '2023年金额', '附注(如适用)', '2022年金额', '2021年金额'], ['除上述各项之外的其他营业外收入和支出', '-6149.80', '-', '-1777.00', '1315.87'], ['其他符合非经常性损益定义的损益项目', '220.56', '-', '243.50', '226.16'], ['减:所得税影响额', '3750.45', '-', '1383.14', '2257.64'], ['少数股东权益影响额(税后)', '1178.93', '-', '3149.90', '4739.46'], ['合计', '17567.23', '', '12992.99', '-32331.75']], 'sort_num': 7234.16}
{'top': 539.3010400000001, 'buttom': 512.1810399999999, 'page_num': 8, 'table_index': 2, 'type': 'page_header', 'content': '十一、 采用公允价值计量的项目 √适用 □不适用 ', 'sort_num': 7460.69896}
{'top': 509.06104, 'buttom': 470.78103999999996, 'page_num': 8, 'table_index': 2, 'type': 'page_header', 'content': '单位:万元 币种:人民币 对当期利润的影响金额 ', 'sort_num': 7490.93896}
{'top': 497.03999999999996, 'buttom': 385.44, 'page_num': 8, 'table_index': 2, 'type': 'table', 'data': [['项目名称', '期初余额', '期末余额', '当期变动', '对当期利润的影响金额'], ['以公允价值计量且其变动计入当期损益的金融资产', '164526.24', '91132.43', '-73393.81', '20153.57'], ['指定为以公允价值计量且其变动计入其他综合收益的金融资产', '30864.33', '40186.93', '9322.60', '1078.00'], ['合计', '195390.57', '131319.36', '-64071.21', '21231.57']], 'sort_num': 7502.96}
{'top': 161.73104, 'buttom': 82.34304, 'page_num': 8, 'type': 'text', 'content': '改革作为推动转型升级的“关键一招”,不断激发动力活力。一是围绕新时代国资央企“三个总”“三个作用”和新一轮国企改革“三个明显成效”要求,系统研究形成“科改行动”和改革深化提升行动实施方案(2023-2025 年)及工作台账,全面完成各项年度改革任务;二是加大改革力度、保持改革节奏,推动重要改革举措深化扩围,中长期激励工作成为航天科工集团先进典型;三是聚焦重点环节持续深化三项制度改革,压紧压实“一岗一表”差异化考核责任压力,经理层绩效年薪占年度薪酬比例 61.8%,管理人员不胜任退出率 14.3%,均达到央企优秀水平。进一', 'sort_num': 7838.26896}
{'top': 134.73104, 'buttom': 123.47408, 'page_num': 9, 'type': 'text', 'content': '二、报告期内公司所处行业情况 ', 'sort_num': 8865.26896}
{'top': 118.05104, 'buttom': 80.25504, 'page_num': 9, 'type': 'text', 'content': '国家高度重视培育数字经济、构建数字社会,数字中国、网络强国等战略从实践探索阶段发展至国家统筹策划、科学实施阶段。粮食安全、农业强国、乡村振兴等国家战略的实施,以及深化税收征管改革等国家级重大部署中,均明确了顺应数字经济发展规律、加大信息技术创新应用', 'sort_num': 8881.94896}
{'top': 207.45104, 'buttom': 87.93504, 'page_num': 10, 'type': 'text', 'content': '航天信息公司作为航天科工集团控股企业、以信息安全技术为核心的国有科技型上市公司,自成立以来,坚持服务国家战略、服务国计民生,依托航天的技术优势、人才优势,加快完善中国特色现代企业制度,有效提升公司治理水平,切实增强企业改革发展活力,扎实推动企业高质量发展。经过 20 余年的发展壮大,逐渐成为行业内具有一定影响力的上市公司,核心竞争力主要表现在以下九个方面:有清晰的战略定位和明确的发展目标,有完善的技术与产品体系,有“科改示范企业”的专项改革政策,有建设世界一流专业领军企业的综合实力,有千万级的庞大用户群体,有国家和行业的完备顶级资质,有充裕的现金资产与强大的融资能力,有遍布全国的营销售后服务体系,有央企背景和航天品牌提供的丰沛资源与信用背书,有一支想干事、能干事、干成事的干部职工队伍。 ', 'sort_num': 9792.54896}
@@ -0,0 +1,22 @@
"","","适用(如)","",""
"非流动性资产处置损益,包括已计提资产减值准备的冲销部分","-236316.65","","232448.97","-46760.24"
"计入当期损益的政府补助,但与公司正常经营业务密切相关、符合国家政策规定、按照确定的标准享有、对公司损益产生持续影响的政府补助除外","4471155.00","","9188174.79","13052067.83"
"除同公司正常经营业务相关的有效套期保值业务外,非金融企业持有金融资产和金融负债产生的公允价值变动损益以及处置金融资产和金融负债产生的损益","13099776.76","","14132376.82","7256455.55"
"计入当期损益的对非金融企业收取的资金占用费","","","",""
"委托他人投资或管理资产的损益","","","",""
"对外委托贷款取得的损益","","","",""
"因不可抗力因素,如遭受自然灾害而产生的各项资产损失","-3826330.90","","",""
"单独进行减值测试的应收款项减值准备转回","","","",""
"企业取得子公司、联营企业及合营企业的投资成本小于取得投资时应享有被投资单位可辨认净资产公允价值产生的收益","","","",""
"同一控制下企业合并产生的子公司期初至合并日的当期净损益","","","",""
"非货币性资产交换损益","","","",""
"债务重组损益","","","",""
"企业因相关经营活动不再持续而发生的一次性费用,如安置职工的支出等","","","",""
"因税收、会计等法律、法规的调整对当期损益产生的一次性影响","","","",""
"因取消、修改股权激励计划一次性确认的股份支付费用","","","",""
"对于现金结算的股份支付,在可行权日之后,应付职工薪酬的公允价值变动产生的损益","","","",""
"采用公允价值模式进行后续计量的投资性房地产公允价值变动产生的损益","","","",""
"交易价格显失公允的交易产生的收益","","","",""
"与公司正常经营业务无关的或有事项产生的损益","","","",""
"受托经营取得的托管费收入","","","",""
"除上述各项之外的其他营业外收","-11648682.96","","-529596.32","34351.19"

@@ -0,0 +1,5 @@
"入和支出","","","",""
"其他符合非经常性损益定义的损益项目","","","-757389.60","-729432.00"
"减:所得税影响额","278940.19","","3339902.20","2935002.34"
"少数股东权益影响额(税后)","","","",""
"合计","1580661.06","","18926112.46","16631679.99"

@@ -0,0 +1,3 @@
"项目名称","期初余额","期末余额","当期变动","对当期利润的影响金额"
"交易性金融资产","390568609.77","175421746.58","-215146863.19","-146863.19"
"合计","390568609.77","175421746.58","-215146863.19","-146863.19"

@@ -0,0 +1 @@
"","","","","","","","","","",""

@@ -0,0 +1,5 @@
"序号","评价维度","指标","公司产品注册标准","2020版中国药典标准","欧洲药典9.0版标准"
"123","杂质含量","卵清蛋白含量","≤60ng/mL","≤200ng/mL","≤500ng/mL"
"","","蛋白质含量","≤360μg/mL","≤400μg/mL","≤600μg/mL"
"","","游离甲醛含量","≤25μg/mL","≤50μg/mL","≤200μg/mL"
"4","有效成分纯度","蛋白质含量/血凝素含量","≤3.0","≤4.5","≤6.0"

@@ -0,0 +1,8 @@
"","本年新增","本年新增","累计数量","累计数量"
"","申请数(个)","获得数(个)","申请数(个)","获得数(个)"
"发明专利","6","3","16","6"
"实用新型专利","2","","12","10"
"外观设计专利","","","",""
"软件著作权","","","",""
"其他","","","",""
"合计","8","3","28","16"

@@ -0,0 +1,6 @@
"","本年度","上年度","变化幅度(%)"
"费用化研发投入","15471820.82","32409476.90","-52.26"
"资本化研发投入","15990870.05","13732758.96","16.44"
"研发投入合计","31462690.87","46142235.86","-31.81"
"研发投入总额占营业收入比例(%)","23.38","14.49","增加8.89个百分点"
"研发投入资本化的比重(%)","50.82","29.76","增加21.06个百分点"

@@ -0,0 +1,12 @@
"","","资规模","金额","金额","阶段性成果","到目标","水平","应用前景"
"1","冻干人用狂犬病疫苗(Vero细胞)","10000.00","1599.09","11578.76","注册申请中","获得生产批件","国内领先","用于预防狂犬病"
"2","四价流感病毒裂解疫苗(儿童)","33000.00","410.69","1481.50","III期临床试验前期准备中","获得生产批件","国内领先","用于预防流行性感冒"
"3","23价肺炎球菌多糖疫苗/13价肺炎球菌多糖结合疫苗","22980.00","123.49","631.25","临床前研究","获得生产批件","国内领先","用于预防肺炎"
"4","冻干水痘减毒活疫苗","31975.00","225.03","946.69","临床前研究","获得生产批件","国内领先","用于预防水痘"
"5","四价流感病毒裂解疫苗(高剂量)","11745.00","110.64","1961.90","临床前研究","获得生产批件","国内领先","用于预防流行性感冒"
"6","重组带状疱疹疫苗","31975.00","168.99","429.68","临床前研究","获得生产批件","国内领先","用于预防带状疱疹"
"7","冻干人用狂犬病疫苗(MRC-5细胞)","27915.00","33.77","200.46","临床前研究","获得生产批件","国内领先","用于预防狂犬病"
"8","多价手足口病疫苗","29910.00","33.77","199.29","临床前研究","获得生产批件","国内领先","用于预防手足口病"
"9","注射用重组人IL12/15-PDL1单纯疱疹I型溶瘤病毒注射液","38910.00","33.49","350.71","临床前研究","获得生产批件","新药","实体瘤治疗"
"10","在中国3至8岁儿童中四价流感病毒裂解疫苗2针次免疫程序的探索研究","300.00","54.38","225.80","临床研究完成","获得注册批件","国内领先","预防流行性感冒"
"合计","/","238710.00","2793.34","18006.04","/","/","/","/"

@@ -0,0 +1,6 @@
"基本情况","基本情况","基本情况"
"","本期数","上期数"
"公司研发人员的数量(人)","60","58"
"研发人员数量占公司总人数的比例(%)","13.10","12.24"
"研发人员薪酬合计","1012.67","932.12"
"研发人员平均薪酬","16.88","16.07"

@@ -0,0 +1,14 @@
"研发人员学历结构","研发人员学历结构"
"学历结构类别","学历结构人数"
"博士研究生","3"
"硕士研究生","6"
"本科","40"
"专科","10"
"高中及以下","1"
"研发人员年龄结构","研发人员年龄结构"
"年龄结构类别","年龄结构人数"
"30岁以下(不含30岁)","29"
"30-40岁(含30岁,不含40岁)","20"
"40-50岁(含40岁,不含50岁)","5"
"50-60岁(含50岁,不含60岁)","4"
"60岁及以上","2"

@@ -0,0 +1,10 @@
"科目","本期数","上年同期数","变动比例(%)"
"营业收入","134591377.00","318486074.97","-57.74"
"营业成本","29864436.32","50588057.11","-40.97"
"销售费用","77073744.58","107494355.33","-28.30"
"管理费用","58638054.44","60622550.89","-3.27"
"财务费用","42981.30","-355527.32","不适用"
"研发费用","15471820.82","32409476.90","-52.26"
"经营活动产生的现金流量净额","80904692.08","38595320.99","109.62"
"投资活动产生的现金流量净额","-187707765.08","112695639.52","-266.56"
"筹资活动产生的现金流量净额","2517734.96","-13250290.31","不适用"
@@ -0,0 +1,2 @@
"主营业务分行业情况"
"分行业","营业收入","营业成本","毛利率(%)","营业收入比上年增减(%)","营业成本比上年增减(%)","毛利率比上年增减(%)"

@@ -0,0 +1,11 @@
"生物制药","134591377.00","29864436.32","77.81","-57.74","-40.97","减少6.31个百分点"
"主营业务分产品情况","主营业务分产品情况","主营业务分产品情况","主营业务分产品情况","主营业务分产品情况","主营业务分产品情况","主营业务分产品情况"
"分产品","营业收入","营业成本","毛利率(%)","营业收入比上年增减(%)","营业成本比上年增减(%)","毛利率比上年增减(%)"
"四价流感病毒裂解疫苗","134591377.00","29864436.32","77.81","-57.74","-40.97","减少6.31个百分点"
"主营业务分地区情况","主营业务分地区情况","主营业务分地区情况","主营业务分地区情况","主营业务分地区情况","主营业务分地区情况","主营业务分地区情况"
"分地区","营业收入","营业成本","毛利率(%)","营业收入比上年增减(%)","营业成本比上年增减(%)","毛利率比上年增减(%)"
"国内","134591377.00","29864436.32","77.81","-57.74","-40.97","减少6.31个百分点"
"主营业务分销售模式情况","主营业务分销售模式情况","主营业务分销售模式情况","主营业务分销售模式情况","主营业务分销售模式情况","主营业务分销售模式情况","主营业务分销售模式情况"
"销售模式","营业收入","营业成本","毛利率(%)","营业收入比上年增减(%)","营业成本比上年增减(%)","毛利率比上年增减(%)"
"直销","134591377.00","29864436.32","77.81","-57.74","-40.97","减少6.31个百分点"
@@ -0,0 +1,2 @@
"主要产品","单位","生产量","销售量","库存量","生产量比上年增减(%)","销售量比上年增减(%)","库存量比上年增减(%)"
"四价流感病毒裂解疫苗","瓶","2945705","1381358","1152015","-53.20","-51.74","-63.93"

@@ -0,0 +1,11 @@
"分行业","成本构成项目","本期金额","本期占总成本比例(%)","上年同期金额","上年同期占总成本比例(%)","本期金额较上年同期变动比例(%)","情况说明"
"生物制药","直接材料","11118814.64","37.23","12840750.18","25.38","-13.41",""
"","直接人工","1506181.29","5.04","2408448.11","4.76","-37.46","销量减少所致"
"","制造费用","9877150.51","33.07","16580810.13","32.78","-40.43",""
"","运输费用","7362289.88","24.66","18758048.69","37.08","-60.75",""
"分产品情况","分产品情况","分产品情况","分产品情况","分产品情况","分产品情况","分产品情况","分产品情况"
"分产品","成本构成项目","本期金额","本期占总成本比例(%)","上年同期金额","上年同期占总成本比例(%)","本期金额较上年同期变动比例(%)","情况说明"
"四价流感病毒裂解疫苗","直接材料","11118814.64","37.23","12840750.18","25.38","-13.41",""
"","直接人工","1506181.29","5.04","2408448.11","4.76","-37.46","销量减少所致"
"","制造费用","9877150.51","33.07","16580810.13","32.78","-40.43",""
"","运输费用","7362289.88","24.66","18758048.69","37.08","-60.75",""

@@ -0,0 +1,5 @@
"2","客户二","509.71","3.79","否"
"3","客户三","318.08","2.36","否"
"4","客户四","309.50","2.30","否"
"5","客户五","256.49","1.91","否"
"合计","/","2214.00","16.45","/"

@@ -0,0 +1,7 @@
"序号","供应商名称","采购额","占年度采购总额比例(%)","是否与上市公司存在关联关系"
"1","供应商一","1599.68","15.59","否"
"2","供应商二","1084.77","10.57","否"
"3","供应商三","941.52","9.18","否"
"4","供应商四","885.84","8.63","否"
"5","供应商五","849.64","8.28","否"
"合计","/","5361.45","52.25","/"

@@ -0,0 +1,5 @@
"科目","本期数","上年同期数","变动比例(%)"
"销售费用","77073744.58","107494355.33","-28.30"
"管理费用","58638054.44","60622550.89","-3.27"
"财务费用","42981.30","-355527.32","不适用"
"研发费用","15471820.82","32409476.90","-52.26"

@@ -0,0 +1,3 @@
"科目","本期数","上年同期数","变动比例(%)"
"经营活动产生的现金流量净额","80904692.08","38595320.99","109.62"
"投资活动产生的现金流量净额","-187707765.08","112695639.52","-266.56"
@@ -0,0 +1,13 @@
"项目名称","本期期末数","本期期末数占总资产的比例(%)","上期期末数","上期期末数占总资产的比例(%)","本期期末金额较上期期末变动比例(%)","情况说明"
"货币资金","70443588.78","4.32","174728926.82","9.56","-59.68","说明1"
"交易性金融资产","175421746.58","10.75","390568609.77","21.38","-55.09","说明2"
"预付款项","2825253.64","0.17","5735966.10","0.31","-50.74","说明3"
"其他应收款","479099.87","0.03","542645.12","0.03","-11.71",""
"在建工程","649464436.15","39.81","619862948.00","33.93","4.78",""
"长期待摊费用","248564.85","0.02","1626952.89","0.09","-84.72","说明4"
"递延所得税资产","33313943.01","2.04","17752280.68","0.97","87.66","说明5"
"其他非流动资产","3358975.00","0.21","3888619.41","0.21","-13.62",""
"短期借款","64057597.23","3.93","42041861.11","2.30","52.37","说明6"
"应付账款","86670216.00","5.31","98922415.32","5.42","-12.39",""
"合同负债","0.00","0.00","50000.00","0.00","-100.00","说明7"
"应交税费","1046668.08","0.06","1168680.25","0.06","-10.44",""
@@ -0,0 +1,2 @@
"细分行业","主要治疗领域","药(产)品名称","注册分类","适应症或功能主治","是否处方药","是否属于中药保护品种(如涉及)","发明专利起止期限(如适用)","是否属于报告期内推出的新药(产)品","是否纳入国家基药目录","是否纳入国家医保目录","是否纳入省级医保目录"
"生物制药","预防流行性感冒","四价流感病毒裂解疫苗","预防用生物制品","预防流行性感冒","否","否","2020-05-05至2037-08-23","否","否","否","否"

@@ -0,0 +1,7 @@
"研发项目(含一致性评价项目)","药(产)品名称","注册分类","适应症或功能主治","是否处方药","是否属于中药保护品种(如涉及)","研发(注册)所处阶段"
"冻干人用狂犬病疫苗(Vero细胞)","冻干人用狂犬病疫苗(Vero细胞)","预防用生物制品3.3类","预防狂犬病","否","否","申报注册"
"四价流感病毒裂解疫苗(儿童)","四价流感病毒裂解疫苗(儿童)","预防用生物制品3.3类","预防流行性感冒","否","否","临床试验"
"23价肺炎球菌多糖疫苗/13价肺炎球菌多糖结合疫苗","23价肺炎球菌多糖疫苗/13价肺炎球菌多糖结合疫苗","预防用生物制品3.3类","预防肺炎","否","否","临床前研究"
"冻干水痘减毒活疫苗","冻干水痘减毒活疫苗","预防用生物制品3.3类","预防水痘","否","否","临床前研究"
"四价流感病毒裂解疫苗(高剂量)","四价流感病毒裂解疫苗(高剂量)","预防用生物制品3.2类","预防流行性感冒","否","否","临床前研究"
"重组带状疱疹疫苗","重组带状疱疹疫苗","预防用生物制品3.3类","预防带状疱疹","否","否","临床前研究"

@@ -0,0 +1,3 @@
"冻干人用狂犬病疫苗(MRC-5细胞)","冻干人用狂犬病疫苗(MRC-5细胞)","预防用生物制品3.3类","预防狂犬病","否","否","临床前研究"
"多价手足口病疫苗","多价手足口病疫苗","预防用生物制品1.4类","预防多价手足口病","否","否","临床前研究"
"注射用重组人IL12/15-PDL1单纯疱疹I型溶瘤病毒注射液","注射用重组人IL12/15-PDL1单纯疱疹I型溶瘤病毒注射液","治疗用生物制品1类","实体瘤治疗","否","否","临床前研究"

@@ -0,0 +1,10 @@
"同行业可比公司","研发投入金额","研发投入占营业收入比例(%)","研发投入占净资产比例(%)","研发投入资本化比重(%)"
"长春百克生物科技股份有限公司","19874.22","10.89","4.94","5.74"
"云南沃森生物技术股份有限公司","91061.04","22.14","9.74","14.74"
"华兰生物疫苗股份有限公司","9321.33","3.87","1.49","2.26"
"康希诺生物股份公司","66167.10","185.3","12.51","3.58"
"北京万泰生物药业股份有限公司","129251.30","23.45","10.03","7.32"
"同行业平均研发投入金额","同行业平均研发投入金额","53136.88","53136.88","53136.88"
"公司报告期内研发投入占营业收入比例(%)","公司报告期内研发投入占营业收入比例(%)","23.38","23.38","23.38"
"公司报告期内研发投入占净资产比例(%)","公司报告期内研发投入占净资产比例(%)","2.29","2.29","2.29"
"公司报告期内研发投入资本化比重(%)","公司报告期内研发投入资本化比重(%)","50.82","50.82","50.82"

@@ -0,0 +1,11 @@
"研发项目","研发投入金额","研发投入费用化金额","研发投入资本化金额","研发投入占营业收入比例(%)","本期金额较上年同期变动比例(%)","情况说明"
"冻干人用狂犬病疫苗(Vero细胞)","1599.09","","1599.09","11.88","16.44",""
"四价流感病毒裂解疫苗(儿童)","410.69","410.69","","3.05","349.70","本报告期该项目已完成Ⅰ期临床试验,正在进行III期临床试验前期准备工作,研发投入同比增加。"
"23价肺炎球菌多糖疫苗/13价肺炎球菌多糖结合疫苗","123.49","123.49","","0.92","20.09",""
"冻干水痘减毒活疫苗","225.03","225.03","","1.67","-18.47",""
"四价流感病毒裂解疫苗(高剂量)","110.64","110.64","","0.82","-92.85","本报告期该项目处于临床前研究阶段,研发投入同比减少。"
"重组带状疱疹疫苗","168.99","168.99","","1.26","80.87","本报告期该项目处于临床前研究阶段,技术服务费研发投入同比增加。"
"冻干人用狂犬病疫苗(MRC-5细胞)","33.77","33.77","","0.25","-55.03","本报告期该项目处于临床前研究阶段,研发投入同比增加。"
"多价手足口病疫苗","33.77","33.77","","0.25","-63.65","本报告期该项目处于临床前研究阶段,研发投入同比增加。"
"注射用重组人IL12/15-PDL1单纯疱疹I型溶瘤病毒注射液","33.49","33.49","","0.25","-57.64","本报告期该项目处于临床前研究阶段,研发投入同比增加。"
"在中国3至8岁儿童中四价流感病毒裂解疫苗2针次免疫程序的探索研究","54.38","54.38","","0.40","-24.28","本报告期该项目临床研究完成,研发投入同比减少。"

@@ -0,0 +1,9 @@
"具体项目名称","本期发生额","本期发生额占销售费用总额比例(%)"
"薪酬及社保费用","862.50","11.19"
"差旅费","66.75","0.87"
"业务招待费","35.21","0.46"
"销售服务费","6469.41","83.93"
"办公费","6.33","0.08"
"会议费","212.51","2.76"
"其他","54.66","0.71"
"合计","7707.37","100.00"

@@ -0,0 +1,8 @@
"同行业可比公司","销售费用","销售费用占营业收入比例(%)"
"长春百克生物科技股份有限公司","64716.89","35.47"
"云南沃森生物技术股份有限公司","151957.55","36.94"
"华兰生物疫苗股份有限公司","94899.25","39.37"
"康希诺生物股份公司","35339.54","98.97"
"北京万泰生物药业股份有限公司","159509.44","28.94"
"公司报告期内销售费用总额","公司报告期内销售费用总额","7707.37"
"公司报告期内销售费用占营业收入比例(%)","公司报告期内销售费用占营业收入比例(%)","57.26"

@@ -0,0 +1,3 @@
"资产类别","期初数","本期公允价值变动损益","计入权益的累计公允价值变动","本期计提的减值","本期购买金额","本期出售/赎回金额","其他变动","期末数"
"其他","390568609.77","-146863.19","","","","215000000.00","","175421746.58"
"合计","390568609.77","-146863.19","","","","215000000.00","","175421746.58"

@@ -0,0 +1,3 @@
"备查文件目录","载有公司负责人、主管会计工作负责人、会计机构负责人(会计主管人员)签名并盖章的财务报表"
"","载有会计师事务所盖章、注册会计师签名并盖章的审计报告原件"
"","报告期内公开披露过的所有公司文件的正本及公告的原稿。"

@@ -0,0 +1,2 @@
"会议届次","召开日期","决议刊登的指定网站的查询索引","决议刊登的披露日期","会议决议"
"2022年年度股东大会","2023年5月10日","www.sse.com.cn","2023年5月11日","议案全部审议通过"
@@ -0,0 +1,10 @@
"姓名","职务","性别","年龄","任期起始日期","任期终止日期","年初持股数","年末持股数","年度内股份增减变动量","增减变动原因","报告期内从公司获得的税前报酬总额(万元)","是否在公司关联方获取报酬"
"余军","董事长、核心技术人员","男","55","2020-06-15","2026-05-10","27049291","37869007","10819716","资本公积金转增股本","128.87","否"
"张良斌","董事","男","49","2020-06-15","2026-05-10","27049291","37869008","10819717","资本公积金转增股本","0","是"
"聂申钱","董事","男","76","2020-06-15","2026-05-10","3381159","4733623","1352464","资本公积金转增股本","0","是"
"夏建国","董事、副总经理","男","51","2020-06-15","2026-05-10","2086865","2921611","834746","不适用","88.87","否"
"邵蓉","独立董事","女","62","2020-06-15","2026-05-10","0","0","0","不适用","12","否"
"管建强","独立董事","男","66","2020-06-15","2026-05-10","0","0","0","不适用","12","否"
"程华(辞职)","独立董事","女","45","2020-06-15","2024-01-10","0","0","0","不适用","12","否"
"魏大昌","监事会主席","男","56","2020-06-15","2026-05-10","0","0","0","不适用","62.78","否"
"余晖晟","职工监事","男","28","2020-06-15","2026-05-10","0","0","0","不适用","8.79","否"
@@ -0,0 +1,11 @@
"黄玲","监事","女","58","2020-06-15","2026-05-10","0","0","0","不适用","12","否"
"张建辉","总经理","男","66","2023-10-27","2026-05-10","4057394","5680352","1622958","资本公积金转增股本","30.50","是"
"任晚琼(离职)","副总经理","女","54","2020-06-15","2023-10-27","0","0","0","不适用","88.86","否"
"樊长勇","副总经理","男","45","2020-06-15","2026-05-10","0","0","0","不适用","56.77","否"
"田国雄","副总经理","男","45","2022-05-30","2026-05-10","0","0","0","不适用","118.03","否"
"滕红刚(离职)","副总经理","男","52","2022-05-30","2023-05-10","0","0","0","不适用","29.31","否"
"黄强","财务总监","男","44","2022-04-08","2026-05-10","0","0","0","不适用","63.17","否"
"李志刚(离职)","副总经理","男","42","2023-10-27","2023-12-22","0","0","0","不适用","41.93","否"
"赵巍(离职)","副总经理","男","45","2023-10-27","2023-12-22","0","0","0","不适用","18.29","否"
"吴建华","核心技术人员","男","49","2009-10-01","-","0","0","0","不适用","40.91","否"
"合计","/","/","/","/","/","","","","/","825.08","/"
@@ -0,0 +1,3 @@
"姓名","主要工作经历"
"余军","1992年8月至1993年10月任临川中心血站技术员;1993年11月至2000年12月任博雅生物制药股份有限公司生产经理;2001年1月至2002年5月任北京耀华生物技术有限公司总工程师;2002年6月至2005年7月任广东佰易药业有限公司副总经理;2005年10月至2014年3月任同路生物制药有限公司副总经理;2014年6月至2015年5月任海南中和药业有限公司副总经理;2015年6月至2023年10月担任公司核心技术人员、董事长、总经理,2023年10月至今担任公司核心技术人员、董事长。"
"张良斌","1999年2月至2000年5月任博雅生物制药股份有限公司出纳;2000年5月至2001年10月任广东康之选医药连锁有限公司配送中心经理;2001年10月至2005年12月任广东佰易药业有限公司销售部经理;2006年1月至今任同路生物制药有限公司副总经理;2017"

@@ -0,0 +1,12 @@
"","年2月至今任浙江海康生物制品有限责任公司董事;2016年8月至今担任广东上量投资有限公司监事;2015年6月至今任公司董事。"
"聂申钱","1969年2月至1987年11月任中国人民解放军海军航空兵部队干部;1987年12月至1993年8月任中国预防医学科学院中预公司经理;1993年9月至2011年9月任中信医药实业有限公司总经理;2011年10月至2013年12月任上药科园信海医药有限公司党委书记;2014年7月至2016年11月任海南中和药业有限公司董事兼总经理;2016年11月至今任海南中和药业股份有限公司董事、高级顾问;2020年4月至今担任海南妙峰山健康产业有限公司执行董事兼总经理;2015年6月至今任公司董事。"
"夏建国","1995年8月至1998年8月任南京药械厂制药机械研究所设计师;1998年8月至2000年12月任博雅生物制药股份有限公司冻干技师;2001年1月至2002年8月任深圳海普瑞生物技术有限公司工程部主管;2002年9月至2005年12月任广东佰易药业有限公司工程部经理;2006年1月至2015年5月任同路生物制药有限公司项目总监;2015年6月至今担任公司董事、副总经理。"
"邵蓉","2020年6月至今担任公司独立董事。现就职于中国药科大学,任国家药物政策与医药产业经济研究中心执行副主任,教授、博士生导师,兼任天境生物(I-Mab)独立董事、江苏当代国安律师事务所执业律师、中国药学会理事、中国药品监督管理研究会政策与法规专业委员会主任委员、中国药促会监事等职。"
"管建强","2020年6月至今担任公司独立董事。现担任华东政法大学教授和博士生导师,兼任江苏图南合金股份有限公司独立董事。"
"程华(辞职)","2020年6月至2023年12月,担任公司独立董事。现担任财政部会计准则委员会高级会计师,兼任中国财政科学研究院硕士生导师、湘财股份有限公司独立董事、悦康药业集团股份有限公司独立董事、山东步长制药股份有限公司独立董事等职。"
"魏大昌","1988年11月至1993年9月,任成都军区后勤部供血站精制组长;1993年9月至1998年11月,任江西省博达生物工程研究所工程师;1998年11月至2005年3月,任广东湛江双林生物制药有限公司总经理助理兼生产部部长;2005年3月至2005年10月,任广东佰易药业有限公司生产部经理;2005年11月至2016年6月,任同路生物制药有限公司生产部经理;2016年6月至2018年5月,任中科生物制药有限公司血制项目总监;2018年6月至2019年6月,任通盈生物制药有限公司血制项目总监;2019年7月至今任公司包装部经理(总监),2020年6月至今任公司监事会主席。"
"余晖晟","2017年2月至今为公司车间员工,2020年6月至今任公司职工代表监事。"
"黄玲","1988年9月至1993年10月任北京东风制药厂技术员;1993年10月至1995年8月,任北京亚都生物公司技术员;1997年10月至2003年3月,任北京巨能公司研究员;2003年3月至今任北京秦脉医药咨询有限责任公司咨询师;2020年6月至今任公司监事。"
"张建辉","1975年1月至1997年12月在江西省抚州地区煤炭公司任职;1998年1月至2007年1月任江西省崇仁县单采血浆站站长;2007年2月至2009年1月任博雅生物制药股份有限公司副总经理;2009年2月至2011年12月任同路生物制药有限公司血浆部副总经理;2012年1月至今任郴州市云鼎房地产有限公司董事长;2020年5月至今任福建省宏冠房地产开发有限公司董事长,2023年10月至今担任公司总经理。"
"任晚琼(离职)","1993年8月至2010年7月任职于河南欣泰药业有限公司,历任质检科职员、质检科主任、质量保证部部长、副总经理;2010年8月至2015年2月任河南远大生物制药有限公司副总经理;2015年6月至2019年6月任公司质量总监;2019年6月至2023年10月任公司副总经理。"
"樊长勇","2001年7月至2004年1月任上海九鼎粉体材料有限公司技术员;2004年1月至2007年7月任上海界龙实业股份有限公司高级经理;2007年7月至2009年8月任国信证券股份有限公司投资银行高级经理;2009年9月至2015年6月任中信证券股份有限公司投资银行"

@@ -0,0 +1,7 @@
"","委员会副总裁(VP)、高级副总裁(SVP)、保荐代表人;2016年4月至2020年4月任上海莱士血液制品股份有限公司董事长助理,2018年9月至2020年4月任同方莱士医药产业投资(广东)有限公司总经理;2020年5月至2024年1月任公司副总经理兼董事会秘书。"
"田国雄","2002年7月至2005年12月先后任广东佰易药业有限公司地区商务经理、地区销售经理,2006年1月至2022年3月先后任同路生物制药有限公司地区销售经理、大区销售经理、大区销售总监。2022年5月起任公司副总经理。"
"滕红刚(离职)","1995年9月至2000年7月在长春生物制品所病毒研究室工作,2003年7月在长春生物制品所获得免疫学硕士学位,2006年7月在吉林大学生命科学学院获得生物化学与分子生物学专业博士学位,2006年10月至2007年4月任中国科学院广州生物医药与健康研究院研究助理,2007年6月至2009年6月任吉林亚泰生物药业股份有限公司副总经理,2009年6月至2011年10月任鸿达生物药业长春股份有限公司副总经理,2011年12月至2015年5月任长春卫尔赛生物药业有限公司生产总监,2015年8月至2016年5月任霍普金斯医药研究院长春分院院长,2016年8月至2022年3月先后任辽宁依生生物制药有限公司副总经理、总经理。2022年5月至2023年5月任公司副总经理。"
"黄强","2002年7月至2016年3月历任河南神火煤电股份有限公司(000933.SZ)财务部科员、副科长、科长;2016年4月至2021年3月历任海南中和药业股份有限公司证券事务代表、董事会办公室主任、财务副总监、总经理助理;2021年4月至2022年4月任江苏金迪克生物技术股份有限公司财务副总监。2022年4月起任公司财务总监。"
"李志刚(离职)","2008年3月至2010年3月任牛津大学高级研究助理;2010年8月至2012年10月任北京必威安泰生物科技有限公司研发项目负责人;2012年10月至2018年3月任北京生物制品研究所有限公司经理、副主任等职;2018年4月至2019年7月任中国生物技术股份有限公司部长助理;2019年8月至2021年5月任北京民海生物科技有限公司质量合规总监;2021年5月至2022年4月任斯微(上海)生物科技有限公司副总裁;2022年5月至2023年10月任君拓生物医药科技(海南)有限公司副总裁。2023年10月至2023年12月担任公司副总经理。"
"赵巍(离职)","2000年10月至2002年9月任武汉海特生物制药股份有限公司员工;2002年9月至2004年6月就读于华中科技大学同济医学院,获学士学位;2005年9月至2007年6月就读于武汉大学,获硕士学位;2007年7月至2023年3月历任武汉生物制品研究所有限责任公司流感疫苗课题组第二课题负责人、病毒性疫苗研究二室主任、流感病毒疫苗室主任。2023年3月至2023年10月任上海君拓生物医药科技有限公司总裁助理(兼无锡君和生物医药科技有限公司副总经理)。2023年10月至2023年12月担任公司副总经理。"
"吴建华","1998年6月至2003年3月,任浙江天元生物药业股份有限公司生产技术员、研发助理工程师;2003年3月至2009年9月,任北京金迪克生物技术研究所研发主管;2009年10月至今任公司质量控制部经理。吴建华主要负责公司四价流感病毒裂解疫苗的临床前和临床试验研究、生产工艺研究和质量控制研究工作,以及公司冻干人用狂犬病疫苗(Vero细胞)、四价流感病毒裂解疫苗(儿童)、四价流感病毒裂解疫苗(高剂量)、冻干水痘减毒活疫苗、冻干带状疱疹减毒活疫苗等在研项目的质量控制研究工作。"

@@ -0,0 +1,4 @@
"任职人员姓名","股东单位名称","在股东单位担任的职务","任期起始日期","任期终止日期"
"余军","泰州同泽","执行事务合伙人","2020年5月",""
"张良斌","泰州同人","执行事务合伙人","2020年5月",""
"在股东单位任职情况的说明","不适用","不适用","不适用","不适用"