# Source: pdf_code/zzb_data_prod/test_0711_v2.py
# (web-scrape page chrome preserved as a comment: "328 lines / 16 KiB / Python /
#  Raw Normal View History"; commit timestamp 2024-11-29 15:58:06 +08:00)
import camelot
import re
#from multiprocessing import Pool
import os, time, random
import json
#from config import MILVUS_CLIENT,MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB,MEASURE_COUNT
from datetime import datetime
# 读取PDF
import PyPDF2
# 分析PDF的layout提取文本
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
import pdfplumber
import mysql.connector
#import utils
from pymilvus import MilvusClient
#import llm_service
#import db_service
#import pdf_title
import numpy as np
#from multiprocessing import Process
# 2025-09-02 15:23:55 +08:00  (commit timestamp from scrape; not code)
import logging
logger = logging.getLogger(__name__)
# 2024-11-29 15:58:06 +08:00  (commit timestamp from scrape; not code)
# Whitelist: regex alternation of financial-metric keywords (revenue, net
# profit, cash flow, EPS, ...). A table is kept only if at least one of these
# appears in its flattened text.
STR_PATTERN = '营业收入|净利润|变动比例|损益|现金流量净额|现金净流量|现金流|每股收益|总资产|资产总额|收益率|货币资金|应收账款|存货|固定资产|在建工程|商誉|短期借款|应付账款|合同负债|长期借款|营业成本|销售费用|管理费用|财务费用|研发费用|研发投入'
# Blacklist: if any of these phrases appears anywhere in a table's text, the
# whole table is discarded (segment/subsidiary/channel breakdowns etc.).
PATTERN = '品牌类型|分门店|销售渠道|行业名称|产品名称|地区名称|子公司名称|业绩快报|调整情况说明|调整年初资产负债表|计入当期损益的政府补助|主要子公司|分部|母公司资产负债表|显示服务|渠道|商品类型|合同分类|会计政策变更|地区分类'
#unit_pattern = re.compile(r'单位[|:]?(百万元|千万元|亿元|万元|千元|元)')
# "Pre-adjustment" marker: tables with 5 or more occurrences are dropped.
MUILT_PATTERN = '调整前'
file_path = r"combined_v61.pdf"  # PDF parsed when this module runs as a script
file_id = 1                      # file identifier (used by the disabled DB bookkeeping)
pages = '1-2'                    # camelot page-range string
tables_range = {}                # page_num -> list of detected table bounding boxes
# def get_table_range_test(file_path, file_id, pages, tables_range):
# print('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
# #(f'file_path: {file_path},file_id:{file_id},pages:{pages},tables_range:{tables_range}')
# start = time.time()
# import tempfile
# temp_dir_path = "F:\\temp"
# # 检查并创建临时文件夹
# if not os.path.exists(temp_dir_path):
# os.makedirs(temp_dir_path)
# # 创建临时文件夹
# temp_dir = tempfile.mkdtemp(prefix="camelot_temp_", dir=temp_dir_path)
# # 设置全局临时文件夹路径
# os.environ["TMP"] = temp_dir
# os.environ["TEMP"] = temp_dir
# # conn = mysql.connector.connect(
# # host= MYSQL_HOST,
# # user= MYSQL_USER,
# # password= MYSQL_PASSWORD,
# # database= MYSQL_DB
# # )
# # 创建一个cursor对象来执行SQL语句
# #print(f'file_path的值是{file_path}')
# #cursor = conn.cursor()
# # try:
# # tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'])
# # print('读取成功')
# # except Exception as e:
# # print(f'错误在{e}')
# #print(f'file_path的值是{file_path}')
# #file_path = "F:\\11_pdf\\688670-2023-nb-nb.pdf"
# os.environ["GHOSTSCRIPT_BINARY"] = "gswin64c"
# try:
# # 确保 file_path 是正确的,并且文件是可访问的
# if not os.path.exists(file_path):
# print(f'文件路径不正确或文件不存在: {file_path}')
# raise FileNotFoundError(f"文件不存在:{file_path}")
# else:
# pass#(f'file_path是存在的就是{file_path}')
# # 读取 PDF 文件
# #tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n')#, copy_text=['h']
# #tables = camelot.read_pdf(file_path, pages=pages, flavor='lattice', strip_text=' ,\n', temp_dir=temp_dir)
# tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'], temp_dir=temp_dir)#line_scale=10,
# print('读取成功')
# print("检测到的表格数量:", tables.n)
# except FileNotFoundError as fe:
# print(fe)
# except Exception as e:
# print(f'处理PDF时出错: {e}')
# for t in tables:
# top = t._bbox[3]
# buttom = t._bbox[1]
# page_num = int(t.page)
# table_index = int(t.order)
# arr = np.array(t.data)
# #recent_value = None
# #这里开始对可能解析错误的值做判断:
# for i, row in enumerate(arr):
# if len(row) >= 4:
# # first_value = row[0]
# # if ("2023年度" in first_value or "2022年度" in first_value) and len(first_value) <= 12:
# # recent_value = first_value
# # if first_value == '' and recent_value:
# # row[0] = recent_value
# # 检查条件:第一列不为数字,第二列和第四列为空,第三列有三个小数点【三列的数字被识别到一起了】
# if (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 4 and len(row[2].rsplit('.', 1)[-1]) == 2) and (row[3] == ''):
# split_values = row[2].split('.')
# # 确保可以正确拆分成三个数值
# if len(split_values) == 4:
# new_value1 = f"{split_values[0]}.{split_values[1][:2]}"
# new_value2 = f"{split_values[1][2:]}.{split_values[2][:2]}"
# new_value3 = f"{split_values[2][2:]}.{split_values[3]}"
# row[1] = new_value1
# row[2] = new_value2
# row[3] = new_value3
# #检查条件:第一列不为数字,第二列第四列为空,第三列两个小数点,第五列两个小数点【两列的数字被识别到一起了】
# if len(row) >= 5 and (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 3) and (row[3] == '') and (len(row[4].split('.')) == 3) and len(row[2].rsplit('.', 1)[-1]) == 2 and len(row[4].rsplit('.', 1)[-1]) == 2:
# split_value_3 = row[2].split('.')
# split_value_5 = row[4].split('.')
# if len(split_value_3) == 3:
# new_value2 = f"{split_value_3[0]}.{split_value_3[1][:2]}"
# new_value3 = f"{split_value_3[1][2:]}.{split_value_3[2]}"
# if len(split_value_5) == 3:
# new_value4 = f"{split_value_5[0]}.{split_value_5[1][:2]}"
# new_value5 = f"{split_value_5[1][2:]}.{split_value_5[2]}"
# row[1] = new_value2
# row[2] = new_value3
# row[3] = new_value4
# row[4] = new_value5
# #检查条件:第一列不为数字,第二列为空,第三列有两个小数点,第四列为正常数字【两列的数字被识别到一起了】
# if len(row) >= 4 and (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 3) and len(row[2].rsplit('.', 1)[-1]) == 2 and (row[3].replace('-', '', 1).replace('.', '', 1).isdigit()):
# split_values = row[2].split('.')
# if len(split_values) == 3:
# new_value2 = f"{split_values[0]}.{split_values[1][:2]}"
# new_value3 = f"{split_values[1][2:]}.{split_values[2]}"
# row[1] = new_value2
# row[2] = new_value3
# #检查条件:第一列不位数字,后面有一列中的值存在“%”并且"%"不是结尾,就进行拆分
# if not row[0].replace('.', '', 1).isdigit():
# for i in range(1, len(row) - 1):
# if row[i] == '' and '%' in row[i + 1] and len(row[i + 1].split('%')) == 2:
# split_values = row[i + 1].split('%')
# new_value1 = f"{split_values[0]}%"
# new_value2 = f"{split_values[1]}"
# row[i] = new_value1
# row[i + 1] = new_value2
# break
# #检查条件当一个列表中同时出现了2022年12月31日和2023年1月1日时【并且都只出现1次】在2022年12月31日后面增加“调整前”字段
# # if sum(1 for item in row if item.strip() == "2023年1月1日") == 1 and sum(1 for item in row if item.strip() == "2022年12月31日") == 1:
# # for i, item in enumerate(row):
# # stripped_item = item.strip() #去空格
# # if stripped_item == "2022年12月31日":
# # row[i] = stripped_item + '调整前'
# new_data = arr.tolist()#用于后面保存到数据库中
# rows, cols = arr.shape
# if rows == 1 and cols == 1:
# continue
# arr_str = ''.join([''.join(map(str, row)) for row in arr])
# #print(f'arr_str的值是 {arr_str}')
# #过滤掉不包含需抽取指标表格的文本
# matches = re.findall(STR_PATTERN, arr_str)
# pattern = re.findall(PATTERN,arr_str)
# muilt_pattern = re.findall(MUILT_PATTERN,arr_str)
# if len(matches) > 0 and len(pattern) == 0 and len(muilt_pattern)<5:
# if not tables_range.get(page_num):
# tables_range[page_num] = []
# tables_range[page_num].append({
# 'top' : top,
# 'buttom' : buttom,
# 'table_index' : table_index,
# 'page_num' : page_num,
# })
# print(f"tables_range的值是{tables_range}")
# #(f'file_id是{file_id}')
# # db_service.insert_pdf_parse_process({
# # 'file_id': file_id,
# # 'page_num' : page_num,
# # 'page_count' : 100,
# # 'type' : 'parse_table',
# # 'content':{
# # 'top' : top,
# # 'buttom' : buttom,
# # 'page_num' : page_num,
# # 'table_index' : table_index,
# # "type" : "table",
# # "data" : new_data,
# # 'sort_num' : page_num*1000 - top
# # }},conn,cursor)
# #get_text_content(file_path, file_id, tables_range, pages, conn, cursor)
# # cursor.close()
# # conn.close()
# end = time.time()
# print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
def _repair_row(row):
    """Repair one table row (mutated in place) whose numeric cells were
    fused together by the PDF extractor.

    Handles, in order:
      1. three decimals fused into column 2 (columns 1 and 3 empty);
      2. two fused pairs, one in column 2 and one in column 4 (>= 5 columns);
      3. one fused pair in column 2 (column 3 already a normal number);
      4. a percentage glued to the following cell's value.

    Every case fires only when column 0 is a non-numeric label. Caller
    guarantees len(row) >= 4.
    """
    first_is_label = not row[0].replace('.', '', 1).isdigit()

    # Case 1: column 2 looks like "a.bbc.dde.ff" — three dots with a
    # two-digit final fragment — while columns 1 and 3 are empty.
    if (first_is_label and row[1] == '' and row[3] == ''
            and len(row[2].split('.')) == 4
            and len(row[2].rsplit('.', 1)[-1]) == 2):
        parts = row[2].split('.')
        row[1] = f"{parts[0]}.{parts[1][:2]}"
        row[2] = f"{parts[1][2:]}.{parts[2][:2]}"
        row[3] = f"{parts[2][2:]}.{parts[3]}"

    # Case 2: columns 2 and 4 each hold two decimals run together
    # (two dots, two-digit tails) and columns 1 and 3 are empty.
    if (len(row) >= 5 and first_is_label and row[1] == '' and row[3] == ''
            and len(row[2].split('.')) == 3
            and len(row[4].split('.')) == 3
            and len(row[2].rsplit('.', 1)[-1]) == 2
            and len(row[4].rsplit('.', 1)[-1]) == 2):
        p2 = row[2].split('.')
        p4 = row[4].split('.')
        row[1] = f"{p2[0]}.{p2[1][:2]}"
        row[2] = f"{p2[1][2:]}.{p2[2]}"
        row[3] = f"{p4[0]}.{p4[1][:2]}"
        row[4] = f"{p4[1][2:]}.{p4[2]}"

    # Case 3: column 2 holds two decimals run together and column 3 already
    # contains a normal (possibly negative) number.
    if (first_is_label and row[1] == ''
            and len(row[2].split('.')) == 3
            and len(row[2].rsplit('.', 1)[-1]) == 2
            and row[3].replace('-', '', 1).replace('.', '', 1).isdigit()):
        parts = row[2].split('.')
        row[1] = f"{parts[0]}.{parts[1][:2]}"
        row[2] = f"{parts[1][2:]}.{parts[2]}"

    # Case 4: a '%' value fused with the next cell (e.g. "12.3%45.6"):
    # split at the '%' and move the head into the empty preceding cell.
    # Only the first such pair is fixed (break), matching prior behavior.
    if first_is_label:
        for j in range(1, len(row) - 1):
            if row[j] == '' and '%' in row[j + 1] and len(row[j + 1].split('%')) == 2:
                head, tail = row[j + 1].split('%')
                row[j] = f"{head}%"
                row[j + 1] = tail
                break


def get_table_range_test(file_path, file_id, pages, tables_range):
    """Locate candidate financial tables on the given PDF pages.

    Reads ``pages`` of ``file_path`` with camelot, repairs rows whose numeric
    cells were fused during extraction, then keeps only tables whose flattened
    text contains at least one wanted metric (STR_PATTERN), none of the
    blacklist phrases (PATTERN), and fewer than five '调整前' markers
    (MUILT_PATTERN). Matching tables' bounding boxes are appended to
    ``tables_range`` in place, keyed by page number.

    Args:
        file_path: path of the PDF file to read.
        file_id: file identifier; currently unused here, kept for the
            disabled DB bookkeeping and for interface compatibility.
        pages: camelot page-range string, e.g. '1-2'.
        tables_range: dict of page_num -> list of
            {'top', 'buttom', 'table_index', 'page_num'} entries; mutated.
    """
    logger.info('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
    start = time.time()
    tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'])
    for t in tables:
        top = t._bbox[3]
        buttom = t._bbox[1]  # (sic — key name kept for compatibility) bbox bottom
        page_num = int(t.page)
        table_index = int(t.order)
        # dtype=object keeps cells as plain Python strings so the repairs in
        # _repair_row cannot be truncated by a fixed-width unicode dtype.
        arr = np.array(t.data, dtype=object)
        for row in arr:
            if len(row) >= 4:
                _repair_row(row)
        new_data = arr.tolist()  # retained for the disabled DB insert below
        rows, cols = arr.shape
        if rows == 1 and cols == 1:
            # Single-cell "tables" are extraction noise; skip them.
            continue
        arr_str = ''.join(''.join(map(str, row)) for row in arr)
        # Filter out tables that do not contain the metrics to extract.
        matches = re.findall(STR_PATTERN, arr_str)
        pattern = re.findall(PATTERN, arr_str)
        muilt_pattern = re.findall(MUILT_PATTERN, arr_str)
        if len(matches) > 0 and len(pattern) == 0 and len(muilt_pattern) < 5:
            if not tables_range.get(page_num):
                tables_range[page_num] = []
            tables_range[page_num].append({
                'top': top,
                'buttom': buttom,
                'table_index': table_index,
                'page_num': page_num,
            })
            logger.debug(f"tables_range的值是{tables_range}")
            # db_service.insert_pdf_parse_process({...}, conn, cursor) — DB
            # persistence of new_data is disabled in this test script.
    end = time.time()
    logger.info('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
2024-11-29 15:58:06 +08:00
# Script entry: parse the configured PDF pages and collect table positions
# into the module-level tables_range dict.
get_table_range_test(file_path, file_id, pages, tables_range)