241 lines
		
	
	
		
			9.9 KiB
		
	
	
	
		
			Python
		
	
	
	
			
		
		
	
	
			241 lines
		
	
	
		
			9.9 KiB
		
	
	
	
		
			Python
		
	
	
	
| import PyPDF2
 | ||
| import re
 | ||
| import os,threading
 | ||
| from config import REDIS_HOST,REDIS_PORT,REDIS_PASSWORD
 | ||
| import redis
 | ||
| import db_service
 | ||
def get_tree_pages(root, info, depth=0, title_array=None):
    """
    Recursively walk the outline tree and collect one record per outline item.

    For each dict-like outline item, the page it points to is resolved and
    looked up in ``info['all_pages']`` (keyed by ``id()`` of the resolved page
    object) to obtain its physical order number. That number is then reduced
    by ``info['padding']``, or set to 0 when it is smaller than the padding.

    Args:
        root: An outline item (dict-like, with '/Page' and '/Title') or an
            iterable of nested outline items.
        info: Dict providing 'all_pages' (page-object id -> page number) and
            'padding' (number of leading pages to subtract).
        depth: Nesting level of ``root``; each recursion step adds 1.
        title_array: List the records are appended to. A fresh list is
            created when omitted, so separate calls never share state.

    Returns:
        ``title_array`` containing dicts with 'title', 'page_num' and 'depth'.
    """
    # A literal [] default would be created once and shared by every call
    # that omits the argument, leaking titles from one PDF into the next.
    if title_array is None:
        title_array = []

    if isinstance(root, dict):
        page = root['/Page'].get_object()
        title = root['/Title']
        if isinstance(title, PyPDF2.generic.ByteStringObject):
            # Byte-string titles are decoded explicitly as UTF-8.
            title = title.original_bytes.decode('utf8')
        title = title.strip().replace('\n', '').replace('\r', '')

        # 0 means "unknown": the page object was never numbered.
        # NOTE(review): this relies on get_object() returning the same Python
        # object that was seen while numbering the pages - confirm for the
        # PyPDF2 version in use.
        page_num = info['all_pages'].get(id(page), 0)
        if page_num == 0:
            print('Not found page number for /Page!', page)
        elif page_num < info['padding']:
            page_num = 0
        else:
            page_num -= info['padding']

        title_array.append({
            'title': title,
            'page_num': page_num,
            'depth': depth
        })

    # Runs for outline items as well as lists: whatever iterating ``root``
    # yields is treated as a child node one level deeper.
    for elem in root:
        get_tree_pages(elem, info, depth + 1, title_array)
    return title_array
 | ||
| 
 | ||
| 
 | ||
def recursive_numbering(obj, info):
    """
    Assign a physical order number to every page reachable from ``obj``.

    A '/Page' node is recorded in ``info['all_pages']`` under ``id(obj)``
    with the current value of ``info['current_page_id']`` (only the first
    time that object is seen), after which the counter is incremented.
    A '/Pages' node is descended into through its '/Kids', in order.
    Nodes of any other type are ignored.
    """
    node_type = obj['/Type']

    if node_type == '/Page':
        # Keep the number from the first sighting; the counter advances
        # regardless of whether this object was already recorded.
        info['all_pages'].setdefault(id(obj), info['current_page_id'])
        info['current_page_id'] += 1
    elif node_type == '/Pages':
        for kid in obj['/Kids']:
            recursive_numbering(kid.get_object(), info)
 | ||
| 
 | ||
def get_numbers_between(numbers_between, start, end):
    """
    Append every integer from ``start`` to ``end`` (both inclusive) to
    ``numbers_between`` and return that same list.

    Nothing is appended when ``end`` is smaller than ``start``.
    """
    numbers_between.extend(range(start, end + 1))
    return numbers_between
 | ||
| 
 | ||
def get_page_end(start, depth, title_array):
    """
    Return the 'page_num' of the first entry at index ``start`` or later
    whose 'depth' equals ``depth``; return -1 when there is none.

    NOTE(review): only entries at exactly the same depth are considered, so
    a shallower heading in between does not stop the search - confirm this
    is the intended notion of "end of section".
    """
    same_depth_pages = (
        title_array[idx]['page_num']
        for idx in range(start, len(title_array))
        if title_array[idx]['depth'] == depth
    )
    return next(same_depth_pages, -1)
 | ||
| 
 | ||
def get_file_split(page_count):
    """
    Split the page range ``0..page_count`` into per-CPU chunks.

    The number of chunks is ``os.cpu_count()`` capped at ``page_count``.
    Every chunk except the last covers ``page_count // chunks`` pages; the
    last chunk runs up to ``page_count`` and so absorbs any remainder.

    Args:
        page_count: Total number of pages in the document.

    Returns:
        Dict with two parallel lists:
        'table_split_parts' - strings of the form ``'start-end'``;
        'text_split_parts'  - lists of every integer from start to end,
        both inclusive.
        Both lists are empty when ``page_count`` is zero or negative.

    NOTE(review): consecutive chunks share their boundary page (one chunk's
    end is the next chunk's start, and the end is included), and the last
    chunk includes ``page_count`` itself - confirm consumers expect that.
    """
    # Nothing to split; also avoids dividing by a zero chunk count below.
    if page_count <= 0:
        return {
            'table_split_parts': [],
            'text_split_parts': []
        }

    # os.cpu_count() may return None when the count cannot be determined.
    part_count = min(os.cpu_count() or 1, page_count)
    pages_per_part = page_count // part_count

    table_split_parts = []
    text_split_parts = []
    for index in range(part_count):
        start_num = index * pages_per_part
        if index < part_count - 1:
            end_num = start_num + pages_per_part
        else:
            # Last chunk takes whatever is left over.
            end_num = page_count
        table_split_parts.append(f'{start_num}-{end_num}')
        text_split_parts.append(list(range(start_num, end_num + 1)))

    return {
        'table_split_parts': table_split_parts,
        'text_split_parts': text_split_parts
    }
 | ||
|     
 | ||
def create_text_outline(pdf_path, file_id):
    """
    Read a PDF's outline, persist it, and work out which pages to discard.

    Steps:
      1. Open the PDF and store its page count in Redis (db 6) under
         ``page_count_{file_id}``.
      2. Number all pages, collect the outline titles, write them through
         ``db_service.pdf_title_insert_mysql`` and read them back with
         ``db_service.get_file_info_from_mysql``.
      3. For every title matching the discard pattern, record the page
         range it spans.
      4. Compute the per-CPU split of the document.

    Args:
        pdf_path: Path of the PDF file to read.
        file_id: Identifier used for the Redis key and the DB rows.

    Returns:
        Dict with 'page_count', 'parent_table_pages' (list of discarded page
        numbers) and 'split_parts' (result of ``get_file_split``).

    NOTE(review): for depth 1/2 titles with no later title at the same
    depth, ``get_page_end`` yields -1, so no pages are recorded for that
    title - confirm this is intended rather than running to the last page.
    """
    # Outline titles whose page ranges are excluded; compiled once instead of
    # on every loop iteration.
    discard_title_pattern = re.compile('母公司|现金流量表补充|重要会计政策|会计估计变更|公允价值的披露|合营安排或联营企业中的权益|与金融工具相关的风险|税项|主要控股参股公司|结构化主体情况|公司股份总数及股东结构变动及公司资产和负债结构的变动情况|所有权或使用权受到限制的资产|在建工程|固定资产|其他主体中的权益|分部信息|与金融工具相关的风险|其他关联交易|公司子公司重大事项')

    with open(pdf_path, 'rb') as file:
        file_info = {}
        fileReader = PyPDF2.PdfReader(file)
        page_count = len(fileReader.pages)

        redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
        # try/finally so the client is closed even when outline parsing or a
        # DB call raises.
        try:
            redis_client.set(f'page_count_{file_id}', page_count)

            info = {
                'page_count': page_count,
                'all_pages': {},
                'current_page_id': 1,
                'padding': 0
            }

            print('Number of pages: %d' % info['page_count'])

            pages = fileReader.trailer['/Root']['/Pages'].get_object()
            recursive_numbering(pages, info)

            title_array = get_tree_pages(fileReader.outline, info, 0, [])
            db_service.pdf_title_insert_mysql(file_id, title_array)
            title_array = db_service.get_file_info_from_mysql(file_id)

            parent_table_pages = []
            print(f'{file_id}:{len(title_array)}')
            last_index = len(title_array) - 1
            for index, title_obj in enumerate(title_array):
                title = title_obj['title']
                if discard_title_pattern.search(title) is None:
                    continue

                page_start = title_obj['page_num']
                depth = title_obj['depth']
                if index < last_index:
                    # Default: the section ends where the next title starts.
                    page_end = title_array[index + 1]['page_num']
                    if depth in [1, 2]:
                        # Depth 1/2 titles end at the next title of the same
                        # depth instead.
                        page_end = get_page_end(index + 1, depth, title_array)
                else:
                    page_end = page_count
                print(f'目录识别时被丢弃的页码:{page_start}-{page_end}')

                # For the parent company's "notes to the main items of the
                # financial statements" title, keep the last page in the
                # discarded range untouched so the core ROE metric can still
                # be recalled; for all other titles drop the end page.
                if len(re.findall('财务报表主要项目注释', title)) == 0:
                    page_end = page_end - 1

                # A separate name is not needed here: extending by the range
                # avoids the original inner loop that reused the outer index.
                parent_table_pages.extend(range(page_start, page_end + 1))

            file_info['page_count'] = page_count
            file_info['parent_table_pages'] = parent_table_pages
            file_info['split_parts'] = get_file_split(page_count)
        finally:
            redis_client.close()

        return file_info
 | ||
| 
 | ||
| 
 | ||
def create_text_outline_disclosure(pdf_path, file_id):
    """
    Read a PDF's outline and work out which pages to discard, without
    persisting the outline.

    Steps:
      1. Open the PDF and store its page count in Redis (db 6) under
         ``page_count_{file_id}``.
      2. Number all pages and collect the outline titles in memory (no
         ``db_service`` calls are made).
      3. For every title matching the discard pattern, record the page
         range it spans.
      4. Compute the per-CPU split of the document.

    Args:
        pdf_path: Path of the PDF file to read.
        file_id: Identifier used for the Redis key.

    Returns:
        Dict with 'page_count', 'parent_table_pages' (list of discarded page
        numbers) and 'split_parts' (result of ``get_file_split``).

    NOTE(review): for depth 1/2 titles with no later title at the same
    depth, ``get_page_end`` yields -1, so no pages are recorded for that
    title - confirm this is intended rather than running to the last page.
    """
    # Outline titles whose page ranges are excluded; compiled once instead of
    # on every loop iteration.
    discard_title_pattern = re.compile('母公司|现金流量表补充|重要会计政策|会计估计变更|公允价值的披露|合营安排或联营企业中的权益|与金融工具相关的风险|税项|主要控股参股公司|结构化主体情况|公司股份总数及股东结构变动及公司资产和负债结构的变动情况|所有权或使用权受到限制的资产|在建工程|固定资产|其他主体中的权益|分部信息|与金融工具相关的风险|其他关联交易|公司子公司重大事项')

    with open(pdf_path, 'rb') as file:
        file_info = {}
        fileReader = PyPDF2.PdfReader(file)
        page_count = len(fileReader.pages)

        redis_client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db=6)
        # try/finally so the client is closed even when outline parsing
        # raises.
        try:
            redis_client.set(f'page_count_{file_id}', page_count)

            info = {
                'page_count': page_count,
                'all_pages': {},
                'current_page_id': 1,
                'padding': 0
            }

            print('Number of pages: %d' % info['page_count'])

            pages = fileReader.trailer['/Root']['/Pages'].get_object()
            recursive_numbering(pages, info)

            title_array = get_tree_pages(fileReader.outline, info, 0, [])

            parent_table_pages = []
            print(f'{file_id}:{len(title_array)}')
            last_index = len(title_array) - 1
            for index, title_obj in enumerate(title_array):
                title = title_obj['title']
                if discard_title_pattern.search(title) is None:
                    continue

                page_start = title_obj['page_num']
                depth = title_obj['depth']
                if index < last_index:
                    # Default: the section ends where the next title starts.
                    page_end = title_array[index + 1]['page_num']
                    if depth in [1, 2]:
                        # Depth 1/2 titles end at the next title of the same
                        # depth instead.
                        page_end = get_page_end(index + 1, depth, title_array)
                else:
                    page_end = page_count
                print(f'目录识别时被丢弃的页码:{page_start}-{page_end}')

                # For the parent company's "notes to the main items of the
                # financial statements" title, keep the last page in the
                # discarded range untouched so the core ROE metric can still
                # be recalled; for all other titles drop the end page.
                if len(re.findall('财务报表主要项目注释', title)) == 0:
                    page_end = page_end - 1

                # Extending by the range avoids the original inner loop that
                # reused the outer loop index.
                parent_table_pages.extend(range(page_start, page_end + 1))

            file_info['page_count'] = page_count
            file_info['parent_table_pages'] = parent_table_pages
            file_info['split_parts'] = get_file_split(page_count)
        finally:
            redis_client.close()

        return file_info
 | ||
if __name__ == '__main__':
    import time

    # Manual run: process the same local PDF twice on separate threads, the
    # second one started five seconds after the first, under two file ids.
    path = "/Users/zhengfei/Desktop/cb/2023年报检测/安妮股份.pdf"

    first_run = threading.Thread(target=create_text_outline, args=(path, '111'))
    first_run.start()

    time.sleep(5)

    second_run = threading.Thread(target=create_text_outline, args=(path, '222'))
    second_run.start()
 | ||
| 
 |