import camelot
import re
#from multiprocessing import Pool
import os, time, random
import json
#from config import MILVUS_CLIENT,MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB,MEASURE_COUNT
from datetime import datetime
# Read PDF files
import PyPDF2
# Analyze the PDF layout and extract text
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
import pdfplumber
import mysql.connector
#import utils
from pymilvus import MilvusClient
#import llm_service
#import db_service
#import pdf_title
import numpy as np
#from multiprocessing import Process
import logging
logger = logging.getLogger(__name__)
STR_PATTERN = '营业收入|净利润|变动比例|损益|现金流量净额|现金净流量|现金流|每股收益|总资产|资产总额|收益率|货币资金|应收账款|存货|固定资产|在建工程|商誉|短期借款|应付账款|合同负债|长期借款|营业成本|销售费用|管理费用|财务费用|研发费用|研发投入'
# If any of these strings appears in a table, the whole table is discarded
PATTERN = '品牌类型|分门店|销售渠道|行业名称|产品名称|地区名称|子公司名称|业绩快报|调整情况说明|调整年初资产负债表|计入当期损益的政府补助|主要子公司|分部|母公司资产负债表|显示服务|渠道|商品类型|合同分类|会计政策变更|地区分类'
#unit_pattern = re.compile(r'单位[: |:]?(百万元|千万元|亿元|万元|千元|元)')
MUILT_PATTERN = '调整前'
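
# Illustrative sketch (not part of the original pipeline): how the three patterns
# are combined further below to decide whether a parsed table is kept. The helper
# name is hypothetical and it is not called anywhere in this module.
def _table_matches_target_patterns(arr_str):
    """Keep a table only if it mentions at least one target measure (STR_PATTERN),
    contains none of the discard keywords (PATTERN), and mentions '调整前'
    (MUILT_PATTERN) fewer than five times."""
    return (len(re.findall(STR_PATTERN, arr_str)) > 0
            and len(re.findall(PATTERN, arr_str)) == 0
            and len(re.findall(MUILT_PATTERN, arr_str)) < 5)
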
file_path = r"combined_v61.pdf"
file_id = 1
pages = '1-2'
tables_range = {}
def get_table_range_test(file_path, file_id, pages, tables_range):
    logger.info('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
    start = time.time()
    # conn = mysql.connector.connect(
    #     host= MYSQL_HOST,
    #     user= MYSQL_USER,
    #     password= MYSQL_PASSWORD,
    #     database= MYSQL_DB
    # )
    # Create a cursor object to execute SQL statements
    #cursor = conn.cursor()
    #redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
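    # Note (added for readability): per camelot's documented behaviour, strip_text
    # removes the listed characters (spaces, commas, newlines) from cell text, and
    # copy_text=['h'] copies text horizontally into empty cells of spanning rows.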
    tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'])
    for t in tables:
        top = t._bbox[3]
        buttom = t._bbox[1]
        page_num = int(t.page)
        table_index = int(t.order)
        arr = np.array(t.data)
        # From here on, check for values that may have been parsed incorrectly:
        for i, row in enumerate(arr):
            if len(row) >= 4:
                # Check: column 1 is not numeric, columns 2 and 4 are empty, and column 3
                # contains three decimal points (three columns' numbers were read as one cell)
                if (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 4 and len(row[2].rsplit('.', 1)[-1]) == 2) and (row[3] == ''):
                    split_values = row[2].split('.')
                    # Make sure the cell can be split into three values
                    if len(split_values) == 4:
                        new_value1 = f"{split_values[0]}.{split_values[1][:2]}"
                        new_value2 = f"{split_values[1][2:]}.{split_values[2][:2]}"
                        new_value3 = f"{split_values[2][2:]}.{split_values[3]}"
                        row[1] = new_value1
                        row[2] = new_value2
                        row[3] = new_value3
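                # Example (hypothetical values): a merged cell '9611395.0411691013.2921.35'
                # is split into '9611395.04', '11691013.29' and '21.35' for columns 2-4.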
                # Check: column 1 is not numeric, columns 2 and 4 are empty, and columns 3
                # and 5 each contain two decimal points (two numbers were merged into each cell)
                if len(row) >= 5 and (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 3) and (row[3] == '') and (len(row[4].split('.')) == 3) and len(row[2].rsplit('.', 1)[-1]) == 2 and len(row[4].rsplit('.', 1)[-1]) == 2:
                    split_value_3 = row[2].split('.')
                    split_value_5 = row[4].split('.')
                    if len(split_value_3) == 3:
                        new_value2 = f"{split_value_3[0]}.{split_value_3[1][:2]}"
                        new_value3 = f"{split_value_3[1][2:]}.{split_value_3[2]}"
                    if len(split_value_5) == 3:
                        new_value4 = f"{split_value_5[0]}.{split_value_5[1][:2]}"
                        new_value5 = f"{split_value_5[1][2:]}.{split_value_5[2]}"
                    row[1] = new_value2
                    row[2] = new_value3
                    row[3] = new_value4
                    row[4] = new_value5
                # Check: column 1 is not numeric, column 2 is empty, column 3 contains two
                # decimal points, and column 4 is a plain number (two numbers were merged into one cell)
                if len(row) >= 4 and (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 3) and len(row[2].rsplit('.', 1)[-1]) == 2 and (row[3].replace('-', '', 1).replace('.', '', 1).isdigit()):
                    split_values = row[2].split('.')
                    if len(split_values) == 3:
                        new_value2 = f"{split_values[0]}.{split_values[1][:2]}"
                        new_value3 = f"{split_values[1][2:]}.{split_values[2]}"
                        row[1] = new_value2
                        row[2] = new_value3
                # Check: column 1 is not numeric and a later cell contains '%' that is not
                # the last character of the value; split that cell at the '%'
                if not row[0].replace('.', '', 1).isdigit():
                    for i in range(1, len(row) - 1):
                        if row[i] == '' and '%' in row[i + 1] and len(row[i + 1].split('%')) == 2:
                            split_values = row[i + 1].split('%')
                            new_value1 = f"{split_values[0]}%"
                            new_value2 = f"{split_values[1]}"
                            row[i] = new_value1
                            row[i + 1] = new_value2
                            break
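                # Example (hypothetical values): a merged cell '12.34%5678.90' becomes
                # '12.34%' in the empty column and '5678.90' in the column after it.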
            new_data = arr.tolist()  # kept for saving to the database later
            rows, cols = arr.shape
            if rows == 1 and cols == 1:
                continue
            arr_str = ''.join([''.join(map(str, row)) for row in arr])
            # Filter out tables whose text does not contain any of the measures to extract
            matches = re.findall(STR_PATTERN, arr_str)
            pattern = re.findall(PATTERN, arr_str)
            muilt_pattern = re.findall(MUILT_PATTERN, arr_str)
            if len(matches) > 0 and len(pattern) == 0 and len(muilt_pattern) < 5:
                if not tables_range.get(page_num):
                    tables_range[page_num] = []
                tables_range[page_num].append({
                    'top': top,
                    'buttom': buttom,
                    'table_index': table_index,
                    'page_num': page_num,
                })
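                # Illustrative shape of tables_range at this point (hypothetical values):
                # {3: [{'top': 712.4, 'buttom': 388.9, 'table_index': 1, 'page_num': 3}]}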
                logger.debug(f"tables_range的值是{tables_range}")
                # db_service.insert_pdf_parse_process({
                #     'file_id': file_id,
                #     'page_num' : page_num,
                #     'page_count' : 100,
                #     'type' : 'parse_table',
                #     'content':{
                #         'top' : top,
                #         'buttom' : buttom,
                #         'page_num' : page_num,
                #         'table_index' : table_index,
                #         "type" : "table",
                #         "data" : new_data,
                #         'sort_num' : page_num*1000 - top
                #     }},conn,cursor)
    # get_text_content(file_path, file_id, tables_range, pages, conn, cursor, redis_client)
    # cursor.close()
    # conn.close()
    # redis_client.close()
    end = time.time()
    logger.info('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
get_table_range_test(file_path, file_id, pages, tables_range)
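
# Illustrative, self-contained sketch (not called anywhere in this module): the
# "three values merged into one cell" repair above, extracted as a pure helper so
# the splitting rule is easier to verify in isolation. The name and sample value
# are hypothetical.
def _split_three_merged_values(cell):
    """Split e.g. '9611395.0411691013.2921.35' into
    ('9611395.04', '11691013.29', '21.35'), assuming each merged number
    keeps exactly two decimal places."""
    parts = cell.split('.')
    if len(parts) != 4 or len(parts[-1]) != 2:
        return None
    return (f"{parts[0]}.{parts[1][:2]}",
            f"{parts[1][2:]}.{parts[2][:2]}",
            f"{parts[2][2:]}.{parts[3]}")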