# pdf_code/zzb_data_prod/test_0711_v2.py
#
# NOTE(review): the original paste contained GitHub web-UI chrome here
# ("326 lines / 16 KiB / Python / Raw Blame History" and the ambiguous-
# Unicode warning banner), which is not valid Python and broke the import
# of this module. It has been converted to this comment block.

import camelot
import re
#from multiprocessing import Pool
import os, time, random
import json
#from config import MILVUS_CLIENT,MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB,MEASURE_COUNT
from datetime import datetime
# 读取PDF
import PyPDF2
# 分析PDF的layout提取文本
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
import pdfplumber
import mysql.connector
#import utils
from pymilvus import MilvusClient
#import llm_service
#import db_service
#import pdf_title
import numpy as np
#from multiprocessing import Process
# Keep-keywords: a table is kept only if its flattened text matches at least
# one of these financial-metric terms (revenue, net profit, cash flow,
# EPS, total assets, goodwill, expense lines, ...). Used with re.findall as
# an alternation pattern.
STR_PATTERN = '营业收入|净利润|变动比例|损益|现金流量净额|现金净流量|现金流|每股收益|总资产|资产总额|收益率|货币资金|应收账款|存货|固定资产|在建工程|商誉|短期借款|应付账款|合同负债|长期借款|营业成本|销售费用|管理费用|财务费用|研发费用|研发投入'
# Discard-keywords: if ANY of these terms appears anywhere in a table
# (brand type, per-store, sales channel, subsidiary breakdowns, restated
# balance sheets, ...), the whole table is dropped.
PATTERN = '品牌类型|分门店|销售渠道|行业名称|产品名称|地区名称|子公司名称|业绩快报|调整情况说明|调整年初资产负债表|计入当期损益的政府补助|主要子公司|分部|母公司资产负债表|显示服务|渠道|商品类型|合同分类|会计政策变更|地区分类'
#unit_pattern = re.compile(r'单位[|:]?(百万元|千万元|亿元|万元|千元|元)')
# Tables containing five or more occurrences of '调整前' ("before
# adjustment") are treated as restatement tables and discarded.
MUILT_PATTERN = '调整前'
# Sample script configuration consumed by the module-level call at the
# bottom of the file.
file_path = r"combined_v61.pdf"  # PDF to parse
file_id = 1                      # file id (only used by the disabled db code)
pages = '1-2'                    # camelot page-range string
tables_range = {}                # output accumulator: {page_num: [bbox dicts]}
# def get_table_range_test(file_path, file_id, pages, tables_range):
# print('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
# #(f'file_path: {file_path},file_id:{file_id},pages:{pages},tables_range:{tables_range}')
# start = time.time()
# import tempfile
# temp_dir_path = "F:\\temp"
# # 检查并创建临时文件夹
# if not os.path.exists(temp_dir_path):
# os.makedirs(temp_dir_path)
# # 创建临时文件夹
# temp_dir = tempfile.mkdtemp(prefix="camelot_temp_", dir=temp_dir_path)
# # 设置全局临时文件夹路径
# os.environ["TMP"] = temp_dir
# os.environ["TEMP"] = temp_dir
# # conn = mysql.connector.connect(
# # host= MYSQL_HOST,
# # user= MYSQL_USER,
# # password= MYSQL_PASSWORD,
# # database= MYSQL_DB
# # )
# # 创建一个cursor对象来执行SQL语句
# #print(f'file_path的值是{file_path}')
# #cursor = conn.cursor()
# # try:
# # tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'])
# # print('读取成功')
# # except Exception as e:
# # print(f'错误在{e}')
# #print(f'file_path的值是{file_path}')
# #file_path = "F:\\11_pdf\\688670-2023-nb-nb.pdf"
# os.environ["GHOSTSCRIPT_BINARY"] = "gswin64c"
# try:
# # 确保 file_path 是正确的,并且文件是可访问的
# if not os.path.exists(file_path):
# print(f'文件路径不正确或文件不存在: {file_path}')
# raise FileNotFoundError(f"文件不存在:{file_path}")
# else:
# pass#(f'file_path是存在的就是{file_path}')
# # 读取 PDF 文件
# #tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n')#, copy_text=['h']
# #tables = camelot.read_pdf(file_path, pages=pages, flavor='lattice', strip_text=' ,\n', temp_dir=temp_dir)
# tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'], temp_dir=temp_dir)#line_scale=10,
# print('读取成功')
# print("检测到的表格数量:", tables.n)
# except FileNotFoundError as fe:
# print(fe)
# except Exception as e:
# print(f'处理PDF时出错: {e}')
# for t in tables:
# top = t._bbox[3]
# buttom = t._bbox[1]
# page_num = int(t.page)
# table_index = int(t.order)
# arr = np.array(t.data)
# #recent_value = None
# #这里开始对可能解析错误的值做判断:
# for i, row in enumerate(arr):
# if len(row) >= 4:
# # first_value = row[0]
# # if ("2023年度" in first_value or "2022年度" in first_value) and len(first_value) <= 12:
# # recent_value = first_value
# # if first_value == '' and recent_value:
# # row[0] = recent_value
# # 检查条件:第一列不为数字,第二列和第四列为空,第三列有三个小数点【三列的数字被识别到一起了】
# if (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 4 and len(row[2].rsplit('.', 1)[-1]) == 2) and (row[3] == ''):
# split_values = row[2].split('.')
# # 确保可以正确拆分成三个数值
# if len(split_values) == 4:
# new_value1 = f"{split_values[0]}.{split_values[1][:2]}"
# new_value2 = f"{split_values[1][2:]}.{split_values[2][:2]}"
# new_value3 = f"{split_values[2][2:]}.{split_values[3]}"
# row[1] = new_value1
# row[2] = new_value2
# row[3] = new_value3
# #检查条件:第一列不为数字,第二列第四列为空,第三列两个小数点,第五列两个小数点【两列的数字被识别到一起了】
# if len(row) >= 5 and (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 3) and (row[3] == '') and (len(row[4].split('.')) == 3) and len(row[2].rsplit('.', 1)[-1]) == 2 and len(row[4].rsplit('.', 1)[-1]) == 2:
# split_value_3 = row[2].split('.')
# split_value_5 = row[4].split('.')
# if len(split_value_3) == 3:
# new_value2 = f"{split_value_3[0]}.{split_value_3[1][:2]}"
# new_value3 = f"{split_value_3[1][2:]}.{split_value_3[2]}"
# if len(split_value_5) == 3:
# new_value4 = f"{split_value_5[0]}.{split_value_5[1][:2]}"
# new_value5 = f"{split_value_5[1][2:]}.{split_value_5[2]}"
# row[1] = new_value2
# row[2] = new_value3
# row[3] = new_value4
# row[4] = new_value5
# #检查条件:第一列不为数字,第二列为空,第三列有两个小数点,第四列为正常数字【两列的数字被识别到一起了】
# if len(row) >= 4 and (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 3) and len(row[2].rsplit('.', 1)[-1]) == 2 and (row[3].replace('-', '', 1).replace('.', '', 1).isdigit()):
# split_values = row[2].split('.')
# if len(split_values) == 3:
# new_value2 = f"{split_values[0]}.{split_values[1][:2]}"
# new_value3 = f"{split_values[1][2:]}.{split_values[2]}"
# row[1] = new_value2
# row[2] = new_value3
# #检查条件:第一列不位数字,后面有一列中的值存在“%”并且"%"不是结尾,就进行拆分
# if not row[0].replace('.', '', 1).isdigit():
# for i in range(1, len(row) - 1):
# if row[i] == '' and '%' in row[i + 1] and len(row[i + 1].split('%')) == 2:
# split_values = row[i + 1].split('%')
# new_value1 = f"{split_values[0]}%"
# new_value2 = f"{split_values[1]}"
# row[i] = new_value1
# row[i + 1] = new_value2
# break
# #检查条件当一个列表中同时出现了2022年12月31日和2023年1月1日时【并且都只出现1次】在2022年12月31日后面增加“调整前”字段
# # if sum(1 for item in row if item.strip() == "2023年1月1日") == 1 and sum(1 for item in row if item.strip() == "2022年12月31日") == 1:
# # for i, item in enumerate(row):
# # stripped_item = item.strip() #去空格
# # if stripped_item == "2022年12月31日":
# # row[i] = stripped_item + '调整前'
# new_data = arr.tolist()#用于后面保存到数据库中
# rows, cols = arr.shape
# if rows == 1 and cols == 1:
# continue
# arr_str = ''.join([''.join(map(str, row)) for row in arr])
# #print(f'arr_str的值是 {arr_str}')
# #过滤掉不包含需抽取指标表格的文本
# matches = re.findall(STR_PATTERN, arr_str)
# pattern = re.findall(PATTERN,arr_str)
# muilt_pattern = re.findall(MUILT_PATTERN,arr_str)
# if len(matches) > 0 and len(pattern) == 0 and len(muilt_pattern)<5:
# if not tables_range.get(page_num):
# tables_range[page_num] = []
# tables_range[page_num].append({
# 'top' : top,
# 'buttom' : buttom,
# 'table_index' : table_index,
# 'page_num' : page_num,
# })
# print(f"tables_range的值是{tables_range}")
# #(f'file_id是{file_id}')
# # db_service.insert_pdf_parse_process({
# # 'file_id': file_id,
# # 'page_num' : page_num,
# # 'page_count' : 100,
# # 'type' : 'parse_table',
# # 'content':{
# # 'top' : top,
# # 'buttom' : buttom,
# # 'page_num' : page_num,
# # 'table_index' : table_index,
# # "type" : "table",
# # "data" : new_data,
# # 'sort_num' : page_num*1000 - top
# # }},conn,cursor)
# #get_text_content(file_path, file_id, tables_range, pages, conn, cursor)
# # cursor.close()
# # conn.close()
# end = time.time()
# print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
def _repair_merged_cells(row):
    """Repair camelot cell-merge artifacts in a single table row, in place.

    camelot occasionally glues the digits of adjacent numeric cells into one
    cell (e.g. '1234.5667.89' is really '1234.56' and '67.89'); every repaired
    number is assumed to carry exactly two decimal places. Only rows whose
    first column is NOT numeric (i.e. label rows) are touched. The caller
    guarantees ``len(row) >= 4``.

    NOTE(review): ``row`` is typically a view into a fixed-width numpy
    string array, so assignments longer than the array's itemsize would be
    silently truncated. The repaired values are never longer than the
    original fused cell, so this should not trigger — worth confirming.
    """
    if row[0].replace('.', '', 1).isdigit():
        return  # numeric first column: not a label row, leave untouched

    # Case 1: three numbers fused into column 2 (four dot-separated parts,
    # two decimals at the end) while columns 1 and 3 are empty.
    parts = row[2].split('.')
    if row[1] == '' and row[3] == '' and len(parts) == 4 and len(parts[-1]) == 2:
        row[1] = f"{parts[0]}.{parts[1][:2]}"
        row[2] = f"{parts[1][2:]}.{parts[2][:2]}"
        row[3] = f"{parts[2][2:]}.{parts[3]}"

    # Case 2: two fused numbers in column 2 AND two more in column 4, with
    # columns 1 and 3 empty (needs at least five columns).
    if len(row) >= 5 and row[1] == '' and row[3] == '':
        parts2 = row[2].split('.')
        parts4 = row[4].split('.')
        if (len(parts2) == 3 and len(parts4) == 3
                and len(parts2[-1]) == 2 and len(parts4[-1]) == 2):
            row[1] = f"{parts2[0]}.{parts2[1][:2]}"
            row[2] = f"{parts2[1][2:]}.{parts2[2]}"
            row[3] = f"{parts4[0]}.{parts4[1][:2]}"
            row[4] = f"{parts4[1][2:]}.{parts4[2]}"

    # Case 3: two fused numbers in column 2, column 1 empty and column 3 a
    # plain (possibly negative) number.
    parts = row[2].split('.')
    if (row[1] == '' and len(parts) == 3 and len(parts[-1]) == 2
            and row[3].replace('-', '', 1).replace('.', '', 1).isdigit()):
        row[1] = f"{parts[0]}.{parts[1][:2]}"
        row[2] = f"{parts[1][2:]}.{parts[2]}"

    # Case 4: a percentage and the following value fused into one cell
    # ('12.34%56.78') next to an empty cell — split once at the '%'.
    for idx in range(1, len(row) - 1):
        nxt = row[idx + 1]
        if row[idx] == '' and '%' in nxt and len(nxt.split('%')) == 2:
            head, tail = nxt.split('%')
            row[idx] = f"{head}%"
            row[idx + 1] = tail
            break


def get_table_range_test(file_path, file_id, pages, tables_range):
    """Locate candidate financial-metric tables on the given PDF pages.

    Reads tables with camelot, repairs common cell-merge extraction errors,
    then keeps a table only when its flattened text contains at least one
    metric keyword (STR_PATTERN), none of the discard keywords (PATTERN),
    and fewer than five '调整前' markers (MUILT_PATTERN). The bounding box
    of each kept table is appended to ``tables_range`` under its page number.

    Args:
        file_path: path to the PDF file.
        file_id: database id of the file; currently unused (kept for the
            disabled persistence code and for caller compatibility).
        pages: camelot page-range string, e.g. ``'1-2'``.
        tables_range: dict mutated in place:
            ``{page_num: [{'top', 'buttom', 'table_index', 'page_num'}, ...]}``.

    Returns:
        None — results accumulate in ``tables_range``.
    """
    print('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
    start = time.time()
    tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'])
    for table in tables:
        # NOTE(review): _bbox is a private camelot attribute — confirm it is
        # stable across the camelot versions in use.
        top = table._bbox[3]
        buttom = table._bbox[1]  # sic: 'buttom' spelling kept for downstream readers
        page_num = int(table.page)
        table_index = int(table.order)
        arr = np.array(table.data)
        rows, cols = arr.shape
        if rows == 1 and cols == 1:
            continue  # degenerate 1x1 "table": nothing to repair or match
        # Repair values that camelot merged into a single cell. Each ``row``
        # is a view into ``arr``, so the repairs mutate the array in place.
        for row in arr:
            if len(row) >= 4:
                _repair_merged_cells(row)
        new_data = arr.tolist()  # plain lists for the (disabled) db persistence below
        arr_str = ''.join(''.join(map(str, row)) for row in arr)
        # Filter out tables that do not carry the metrics we extract.
        matches = re.findall(STR_PATTERN, arr_str)
        pattern = re.findall(PATTERN, arr_str)
        muilt_pattern = re.findall(MUILT_PATTERN, arr_str)
        if len(matches) > 0 and len(pattern) == 0 and len(muilt_pattern) < 5:
            tables_range.setdefault(page_num, []).append({
                'top': top,
                'buttom': buttom,
                'table_index': table_index,
                'page_num': page_num,
            })
        print(f"tables_range的值是{tables_range}")
        # Disabled: db_service.insert_pdf_parse_process(...) used to persist
        # ``new_data`` plus the bbox per table; re-enable together with the
        # mysql connection/cursor setup above.
    end = time.time()
    print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
if __name__ == "__main__":
    # Run the parser on the module-level sample configuration only when this
    # file is executed as a script, not when it is imported.
    get_table_range_test(file_path, file_id, pages, tables_range)