pdf_code/zzb_data_prod/space/pdf_title.py

176 lines
6.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import PyPDF2
import re
import os,threading
from config import REDIS_HOST,REDIS_PORT,REDIS_PASSWORD
import redis
def get_tree_pages(root, info, depth=0, title_array=None):
    """
    Recursively walk the outline tree and collect one record per outline item.

    ``root`` is either an outline item (a dict with '/Title' and '/Page'
    entries) or an iterable of items / nested iterables. For every item the
    physical page number is looked up in ``info['all_pages']`` (keyed by
    ``id()`` of the resolved page object) and reduced by ``info['padding']``.

    Args:
        root: outline item or (nested) iterable of outline items.
        info: dict providing 'all_pages' (id(page) -> page number) and
            'padding' (number subtracted from every page number).
        depth: nesting level of ``root``; items directly inside the
            top-level iterable are recorded with ``depth + 1``.
        title_array: list to append records to; a new list is created when
            omitted.

    Returns:
        The list of ``{'title', 'page_num', 'depth'}`` dicts, in traversal
        order. ``page_num`` is 0 when the page is unknown or falls inside
        the padding.
    """
    # A default of [] would be shared between calls and keep accumulating
    # titles from previously processed files.
    if title_array is None:
        title_array = []

    if isinstance(root, dict):
        page = root['/Page'].get_object()
        title = root['/Title']
        # str titles need no decoding; only byte-string titles are decoded.
        if not isinstance(title, str) and isinstance(title, PyPDF2.generic.ByteStringObject):
            title = title.original_bytes.decode('utf8')
        title = title.strip().replace('\n', '').replace('\r', '')

        page_num = info['all_pages'].get(id(page), 0)
        if page_num == 0:
            print('Not found page number for /Page!', page)
        elif page_num < info['padding']:
            page_num = 0
        else:
            page_num -= info['padding']

        title_array.append({
            'title': title,
            'page_num': page_num,
            'depth': depth
        })
        # An outline item is a leaf here. Iterating a dict would walk its
        # string keys and recurse without end, so stop at this point.
        return title_array

    for elem in root:
        get_tree_pages(elem, info, depth + 1, title_array)
    return title_array
def recursive_numbering(obj, info):
    """
    Walk the page tree in order and give every page a physical order number.

    '/Pages' nodes are descended into through their '/Kids' (each kid is
    resolved with ``get_object()``). A '/Page' node that has not been seen
    yet is stored in ``info['all_pages']`` under ``id(obj)`` with the value
    of ``info['current_page_id']``, which is then incremented. Nodes of any
    other type are ignored. Returns None; ``info`` is updated in place.
    """
    node_type = obj['/Type']

    if node_type == '/Pages':
        for kid in obj['/Kids']:
            recursive_numbering(kid.get_object(), info)
        return

    if node_type != '/Page':
        return

    numbered = info['all_pages']
    key = id(obj)
    if key in numbered:
        # Already numbered: keep the first number assigned.
        return
    numbered[key] = info['current_page_id']
    info['current_page_id'] += 1
def get_numbers_between(numbers_between, start, end):
    """
    Append every integer from ``start`` to ``end`` (both inclusive) to
    ``numbers_between`` and return that same list.

    Nothing is appended when ``end`` is smaller than ``start``.
    """
    numbers_between.extend(range(start, end + 1))
    return numbers_between
def get_page_end(start, depth, title_array):
    """
    Return the 'page_num' of the first entry at index ``start`` or later
    whose 'depth' equals ``depth``; return -1 when no such entry exists.
    """
    matching_pages = (
        title_array[idx]['page_num']
        for idx in range(start, len(title_array))
        if title_array[idx]['depth'] == depth
    )
    return next(matching_pages, -1)
def get_file_split(page_count, num_parts=None):
    """
    Split the page range of a document into contiguous parts.

    The number of parts defaults to the CPU count and never exceeds
    ``page_count``. Every part except the last spans
    ``page_count // parts`` pages; the last part absorbs the remainder and
    always ends at ``page_count``.

    Args:
        page_count: total number of pages.
        num_parts: optional number of parts to use instead of the CPU count.

    Returns:
        A dict with
        'table_split_parts': list of ``'start-end'`` strings, and
        'text_split_parts': list of lists holding every integer from
        ``start`` to ``end`` inclusive (so consecutive parts share their
        boundary number).
        Both lists are empty when there is nothing to split.
    """
    if num_parts is None:
        # os.cpu_count() returns None when the count cannot be determined.
        num_parts = os.cpu_count() or 1
    num_parts = min(num_parts, page_count)

    table_split_parts = []
    text_split_parts = []
    # Guard against zero parts: dividing by it used to raise for empty files.
    if num_parts > 0:
        chunk_size = page_count // num_parts
        last_index = num_parts - 1
        for index in range(num_parts):
            start_num = index * chunk_size
            end_num = page_count if index == last_index else start_num + chunk_size
            table_split_parts.append(f'{start_num}-{end_num}')
            text_split_parts.append(list(range(start_num, end_num + 1)))

    return {
        'table_split_parts': table_split_parts,
        'text_split_parts': text_split_parts
    }
def create_text_outline(pdf_path, file_id):
    """
    Read a PDF's outline and work out which pages to discard and how to
    split the document.

    Steps:
      1. Open the PDF and store its page count in Redis (db 6) under the
         key ``page_count_<file_id>``.
      2. Number every page of the page tree and flatten the outline into
         title records via ``recursive_numbering`` / ``get_tree_pages``.
      3. For every title matching the exclusion pattern, collect the page
         numbers from that title's page up to the page where the section
         is taken to end.
      4. Compute the split parts with ``get_file_split``.

    Args:
        pdf_path: path of the PDF file to read.
        file_id: identifier used in the Redis key and in log output.

    Returns:
        dict with 'page_count', 'parent_table_pages' (list of collected
        page numbers) and 'split_parts' (result of ``get_file_split``).
    """
    with open(pdf_path, 'rb') as file:
        fileReader = PyPDF2.PdfReader(file)
        page_count = len(fileReader.pages)

        redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
        try:
            redis_client.set(f'page_count_{file_id}', page_count)
        finally:
            # Release the connection even when the write fails.
            redis_client.close()

        info = {
            'page_count': page_count,
            'all_pages': {},
            'current_page_id': 1,
            'padding': 0
        }
        print('Number of pages: %d' % info['page_count'])
        pages = fileReader.trailer['/Root']['/Pages'].get_object()
        recursive_numbering(pages, info)
        title_array = get_tree_pages(fileReader.outline, info, 0, [])
        print(f'{file_id}:{len(title_array)}')

        parent_table_pages = []
        last_index = len(title_array) - 1
        for index, title_obj in enumerate(title_array):
            title = title_obj['title']
            if not re.search('母公司|现金流量表补充|重要会计政策|会计估计变更|公允价值的披露|合营安排或联营企业中的权益|与金融工具相关的风险|税项', title):
                continue

            page_start = title_obj['page_num']
            depth = title_obj['depth']
            if index < last_index:
                page_end = title_array[index + 1]['page_num']
                if depth in (1, 2):
                    # For depth 1 and 2 titles the section runs until the
                    # next title of the same depth.
                    # NOTE(review): get_page_end returns -1 when no later
                    # title has this depth, which makes the range below
                    # empty; confirm that is the intended outcome.
                    page_end = get_page_end(index + 1, depth, title_array)
            else:
                page_end = page_count
            print(f'目录识别时被丢弃的页码:{page_start}-{page_end}')

            # When the title contains '财务报表主要项目注释' the last page is
            # kept in the range, so the core ROE metric can still be found.
            if not re.search('财务报表主要项目注释', title):
                page_end = page_end - 1
            parent_table_pages.extend(range(page_start, page_end + 1))

        return {
            'page_count': page_count,
            'parent_table_pages': parent_table_pages,
            'split_parts': get_file_split(page_count)
        }
if __name__ == '__main__':
    import time

    # Manual check: process the same local PDF from two threads, started
    # five seconds apart, under two different file ids.
    path = "/Users/zhengfei/Desktop/cb/2023年报检测/安妮股份.pdf"

    first_worker = threading.Thread(target=create_text_outline, args=(path, '111'))
    first_worker.start()

    time.sleep(5)

    second_worker = threading.Thread(target=create_text_outline, args=(path, '222'))
    second_worker.start()