"""Read a PDF outline, map outline titles to physical page numbers, and
derive the page ranges to skip plus per-worker page splits."""
import os
import re
import threading

import PyPDF2
import redis

import db_service
from config import REDIS_HOST, REDIS_PORT, REDIS_PASSWORD

# Outline titles matching any of these alternatives have their page range
# recorded in ``parent_table_pages``.
_DROPPED_TITLE_PATTERN = re.compile('母公司|现金流量表补充|重要会计政策|会计估计变更|公允价值的披露|合营安排或联营企业中的权益|与金融工具相关的风险|税项|主要控股参股公司|结构化主体情况|公司股份总数及股东结构变动及公司资产和负债结构的变动情况|所有权或使用权受到限制的资产|在建工程|固定资产|其他主体中的权益|分部信息|与金融工具相关的风险|其他关联交易|公司子公司重大事项')

# Titles matching this keep their last page in the recorded range (see
# ``_collect_dropped_pages``).
_KEEP_LAST_PAGE_PATTERN = re.compile('财务报表主要项目注释')


def get_tree_pages(root, info, depth=0, title_array=None):
    """Recursively walk the outline tree and collect title/page records.

    Args:
        root: Either one outline item (a dict-like object with ``/Page`` and
            ``/Title``) or an iterable of outline items / nested iterables.
        info: Dict with ``all_pages`` (``id(page object)`` -> physical page
            number, as filled by ``recursive_numbering``) and ``padding``
            (amount subtracted from each resolved page number).
        depth: Nesting depth of ``root``; children are recorded at
            ``depth + 1``.
        title_array: List the records are appended to. A fresh list is
            created when omitted.

    Returns:
        ``title_array``, holding dicts with ``title``, ``page_num`` and
        ``depth``. ``page_num`` is 0 when the page could not be resolved or
        lies before the padding.
    """
    # A mutable default would be shared between unrelated calls.
    if title_array is None:
        title_array = []

    if isinstance(root, dict):
        page = root['/Page'].get_object()
        raw_title = root['/Title']
        title = raw_title
        if isinstance(raw_title, PyPDF2.generic.ByteStringObject):
            title = raw_title.original_bytes.decode('utf8')
        title = title.strip().replace('\n', '').replace('\r', '')

        # Pages are keyed by object identity, so this relies on the reader
        # handing back the same page object that was numbered earlier.
        page_num = info['all_pages'].get(id(page), 0)
        if page_num == 0:
            print('Not found page number for /Page!', page)
        elif page_num < info['padding']:
            page_num = 0
        else:
            page_num -= info['padding']

        title_array.append({
            'title': title,
            'page_num': page_num,
            'depth': depth
        })
        # An outline item is a leaf of this walk: nested items arrive as
        # nested lists. Iterating a plain dict here would yield its string
        # keys and recurse without end.
        return title_array

    for elem in root:
        get_tree_pages(elem, info, depth + 1, title_array)
    return title_array


def recursive_numbering(obj, info):
    """Assign each page of the page tree its physical order number.

    Walks ``/Pages`` nodes through ``/Kids`` and, for every ``/Page`` node
    not yet seen, stores ``info['current_page_id']`` under ``id(obj)`` in
    ``info['all_pages']`` and then increments the counter.

    Args:
        obj: A page-tree node exposing ``/Type`` (and ``/Kids`` for
            ``/Pages`` nodes).
        info: Dict with ``all_pages`` and ``current_page_id``; mutated in
            place.
    """
    node_type = obj['/Type']
    if node_type == '/Page':
        all_pages = info['all_pages']
        obj_id = id(obj)
        if obj_id not in all_pages:
            all_pages[obj_id] = info['current_page_id']
            info['current_page_id'] += 1
    elif node_type == '/Pages':
        for kid in obj['/Kids']:
            recursive_numbering(kid.get_object(), info)


def get_numbers_between(numbers_between, start, end):
    """Append every integer from ``start`` to ``end`` (inclusive) to the list.

    Args:
        numbers_between: List to extend; mutated in place.
        start: First number to append.
        end: Last number to append.

    Returns:
        The same ``numbers_between`` list.
    """
    numbers_between.extend(range(start, end + 1))
    return numbers_between


def get_page_end(start, depth, title_array):
    """Return the page number of the first entry at ``depth`` from ``start`` on.

    Args:
        start: Index in ``title_array`` where the scan begins.
        depth: Depth value the entry must equal.
        title_array: Sequence of dicts with ``depth`` and ``page_num``.

    Returns:
        The ``page_num`` of the first matching entry, or -1 when none exists.
    """
    # NOTE(review): only entries with exactly the same depth end the scan, and
    # -1 is returned when there is none; callers then build an empty page
    # range for that title. Confirm this is intended for the last section.
    for entry in title_array[start:]:
        if entry['depth'] == depth:
            return entry['page_num']
    return -1


def get_file_split(page_count):
    """Split ``page_count`` pages into at most one part per CPU core.

    Each part spans ``start_num`` to ``end_num`` inclusive, so consecutive
    parts share their boundary number; the last part ends at ``page_count``.

    Args:
        page_count: Total number of pages.

    Returns:
        Dict with ``table_split_parts`` (``'start-end'`` strings) and
        ``text_split_parts`` (lists of every number from start to end
        inclusive). Both lists are empty when ``page_count`` is 0 or less.
    """
    if page_count <= 0:
        # Nothing to split; also avoids dividing by a part count of zero.
        return {
            'table_split_parts': [],
            'text_split_parts': []
        }

    # os.cpu_count() may return None when the count cannot be determined.
    part_count = min(page_count, os.cpu_count() or 1)
    quotient = page_count // part_count

    table_split_parts = []
    text_split_parts = []
    for i in range(part_count):
        start_num = i * quotient
        if i < part_count - 1:
            end_num = start_num + quotient
        else:
            # The last part absorbs the remainder of the division.
            end_num = page_count
        table_split_parts.append(f'{start_num}-{end_num}')
        text_split_parts.append(get_numbers_between([], start_num, end_num))

    return {
        'table_split_parts': table_split_parts,
        'text_split_parts': text_split_parts
    }


def _collect_dropped_pages(title_array, page_count):
    """Return the page numbers covered by titles matching the drop pattern."""
    dropped_pages = []
    last_index = len(title_array) - 1
    for index, title_obj in enumerate(title_array):
        title = title_obj['title']
        if not _DROPPED_TITLE_PATTERN.search(title):
            continue

        page_start = title_obj['page_num']
        depth = title_obj['depth']
        if index < last_index:
            page_end = title_array[index + 1]['page_num']
            if depth in [1, 2]:
                # Depth 1/2 sections run up to the next entry of equal depth.
                page_end = get_page_end(index + 1, depth, title_array)
        else:
            page_end = page_count
        print(f'目录识别时被丢弃的页码:{page_start}-{page_end}')

        # For the parent-company "notes to main financial statement items"
        # title the last page is not filtered out, so the core ROE indicator
        # can still be recalled; every other title excludes its end page.
        if not _KEEP_LAST_PAGE_PATTERN.search(title):
            page_end -= 1
        dropped_pages.extend(range(page_start, page_end + 1))
    return dropped_pages


def _build_file_info(pdf_path, file_id, persist_titles):
    """Parse the PDF outline and build the file-info dict for both entry points."""
    with open(pdf_path, 'rb') as file:
        fileReader = PyPDF2.PdfReader(file)
        page_count = len(fileReader.pages)

        redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
        try:
            redis_client.set(f'page_count_{file_id}', page_count)
        finally:
            # Close the connection even when the write fails.
            redis_client.close()

        info = {
            'page_count': page_count,
            'all_pages': {},
            'current_page_id': 1,
            'padding': 0
        }
        print('Number of pages: %d' % info['page_count'])

        pages = fileReader.trailer['/Root']['/Pages'].get_object()
        recursive_numbering(pages, info)
        title_array = get_tree_pages(fileReader.outline, info, 0, [])

    if persist_titles:
        # Store the titles and continue with what the database returns.
        db_service.pdf_title_insert_mysql(file_id, title_array)
        title_array = db_service.get_file_info_from_mysql(file_id)

    print(f'{file_id}:{len(title_array)}')

    file_info = {}
    file_info['page_count'] = page_count
    file_info['parent_table_pages'] = _collect_dropped_pages(title_array, page_count)
    file_info['split_parts'] = get_file_split(page_count)
    return file_info


def create_text_outline(pdf_path, file_id):
    """Build file info from a PDF outline, round-tripping titles through MySQL.

    Writes the page count to Redis under ``page_count_{file_id}``, inserts the
    outline titles via ``db_service.pdf_title_insert_mysql`` and reads them
    back via ``db_service.get_file_info_from_mysql`` before computing ranges.

    Args:
        pdf_path: Path of the PDF file to read.
        file_id: Identifier used for the Redis key and the database calls.

    Returns:
        Dict with ``page_count``, ``parent_table_pages`` (page numbers under
        matching outline titles) and ``split_parts`` (see ``get_file_split``).
    """
    return _build_file_info(pdf_path, file_id, persist_titles=True)


def create_text_outline_disclosure(pdf_path, file_id):
    """Build file info from a PDF outline without touching MySQL.

    Same as ``create_text_outline`` except that the titles parsed from the
    outline are used directly instead of being stored and re-read.

    Args:
        pdf_path: Path of the PDF file to read.
        file_id: Identifier used for the Redis key.

    Returns:
        Dict with ``page_count``, ``parent_table_pages`` and ``split_parts``.
    """
    return _build_file_info(pdf_path, file_id, persist_titles=False)


if __name__ == '__main__':
    import time

    path = "/Users/zhengfei/Desktop/cb/2023年报检测/安妮股份.pdf"
    threading.Thread(target=create_text_outline, args=(path, '111')).start()
    time.sleep(5)
    threading.Thread(target=create_text_outline, args=(path, '222')).start()