"""Extract tables and the text around them from a PDF with camelot and pdfminer."""

import ast
import json
import re
import sys
import time

import camelot
import numpy as np
import pdfplumber
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal

import utils

# A text box counts as a table title when its top edge lies strictly between
# these two distances above the table's top edge.
# NOTE(review): presumably PDF points, as reported by pdfminer/camelot - confirm.
_TITLE_MIN_GAP = 5
_TITLE_MAX_GAP = 100

# Text boxes whose bottom edge is below this y coordinate are treated as
# "bottom of the page" text (e.g. a table title whose table starts on the
# next page).
_PAGE_BOTTOM_Y = 150

# sort_num = page * stride - top, so records sort by page first and then from
# the top of the page downwards.
# NOTE(review): assumes y coordinates stay below 1000; larger pages would
# interleave with the previous page - confirm.
_PAGE_SORT_STRIDE = 1000

_REPORT_HEADER_KEYWORD = '年度报告'
_PAGE_NUMBER_PATTERN = re.compile(r'^\d+(/\d+)?$')

# PDF processed when the script is run without a command-line argument.
_DEFAULT_PDF_PATH = '/Users/zhengfei/Desktop/0609/zhangjun-430489-2023-nb-nb_2824bc6302e9442285aed64eed760d99/zhangjun-430489-2023-nb-nb_2824bc6302e9442285aed64eed760d99_1.pdf'


def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i : i + n]


def extract_tables(filepath, pages_num, chunk_num=50, export_path=".", params=None):
    """Extract tables chunk by chunk and export each chunk to disk.

    The page list is split into groups of ``chunk_num`` pages. Each group is
    read and exported before the next one is read, so only one group's tables
    are referenced at a time.

    Args:
        filepath: Filepath or URL of the PDF file.
        pages_num: Comma-separated page numbers, e.g. '1,3,4', '1,4-end'
            or 'all'.
        chunk_num: Number of pages handled per chunk.
        export_path: Directory used as the prefix of the export target.
        params: Currently unused; kept so existing callers keep working.
    """
    # Resolve the page specification through camelot's own (private) parser.
    handler = camelot.handlers.PDFHandler(filepath)
    page_list = handler._get_pages(pages=pages_num)

    for chunk in chunks(page_list, chunk_num):
        pages_string = ", ".join(str(page) for page in chunk)
        tables = camelot.read_pdf(filepath, pages=pages_string, strip_text=' ,\n', copy_text=['h'])
        # NOTE(review): every chunk exports to the same target path; this
        # relies on camelot deriving a distinct file name per table - confirm.
        tables.export(f"{export_path}/tables.csv")


def get_pdf_info(file_path, pages):
    """Collect tables and their surrounding text from a PDF, in reading order.

    Tables are read with camelot. Text boxes are read with pdfminer and kept
    when they sit just above a table (candidate table titles) or near the
    bottom of a page. Every record is printed in sorted order and the sorted
    list is returned.

    Args:
        file_path: Path of the PDF file.
        pages: Page specification handed to ``camelot.read_pdf``.

    Returns:
        A list of dicts sorted by ``sort_num``. Table records carry ``data``;
        text records carry ``content``.
    """
    tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'])
    pdf_info = []
    tables_range = {}  # page number -> vertical extents of the tables on it

    for t in tables:
        top = t._bbox[3]
        buttom = t._bbox[1]
        page_num = int(t.page)
        table_index = int(t.order)
        tables_range.setdefault(page_num, []).append({
            'top': top,
            'buttom': buttom,
            'table_index': table_index,
            'page_num': page_num,
        })
        pdf_info.append({
            'top': top,
            'buttom': buttom,
            'page_num': page_num,
            'table_index': table_index,
            "type": "table",
            "data": t.data,
            'sort_num': page_num * _PAGE_SORT_STRIDE - top,
        })

    for pagenum, page in enumerate(extract_pages(file_path)):
        current_page = pagenum + 1
        page_tables = tables_range.get(current_page) or []

        for i, element in enumerate(page):
            # Only horizontal text boxes are of interest.
            if not isinstance(element, LTTextBoxHorizontal):
                continue

            text_type = 'text'
            line_text = re.sub(r"\s", "", element.get_text())
            element_top = element.bbox[3]
            element_buttom = element.bbox[1]
            # Text inside any table on this page is already part of the table data.
            inside_table = text_in_table(element_top, tables_range, current_page)

            if not inside_table:
                for table_range in page_tables:
                    gap = element_top - table_range['top']
                    if not _TITLE_MIN_GAP < gap < _TITLE_MAX_GAP:
                        continue
                    # The first element of a page may be the running header.
                    if i == 0:
                        text_type = get_text_type(line_text)
                        if text_type == 'page_header':
                            break
                    # NOTE(review): the original indentation was lost; the
                    # record is assumed to be appended whether or not the
                    # black-list check matches - confirm.
                    if utils.check_table_title_black_list(line_text):
                        print(line_text)
                    pdf_info.append({
                        'top': element_top,
                        'buttom': element_buttom,
                        'page_num': table_range['page_num'],
                        'table_index': table_range['table_index'],
                        "type": text_type,
                        'content': line_text,
                        'sort_num': table_range['page_num'] * _PAGE_SORT_STRIDE - element_top,
                    })
                    break

            # Handle a (parent-company) table title at the bottom of a page
            # whose table only starts on the next page.
            if element_buttom < _PAGE_BOTTOM_Y and not inside_table:
                text_type = get_text_type(line_text)
                if text_type == 'page_footer':
                    continue
                pdf_info.append({
                    'top': element_top,
                    'buttom': element_buttom,
                    'page_num': current_page,
                    "type": text_type,
                    'content': line_text,
                    'sort_num': current_page * _PAGE_SORT_STRIDE - element_top,
                })

    sorted_pdf_info = sorted(pdf_info, key=lambda k: k['sort_num'])
    for info in sorted_pdf_info:
        print(info)
    return sorted_pdf_info


def text_in_table(top, tables_range, page_num):
    """Return True if ``top`` lies strictly inside a table on ``page_num``.

    Args:
        top: y coordinate to test.
        tables_range: Mapping of page number to a list of dicts with the
            keys 'top' and 'buttom'.
        page_num: Page whose tables are checked.
    """
    return any(
        table['buttom'] < top < table['top']
        for table in tables_range.get(page_num) or ()
    )


def get_text_type(text: str):
    """Classify a text line as 'page_header', 'page_footer' or 'text'.

    A line containing the report keyword is a page header; a line that is
    only a page number such as '6' or '6/223' is a page footer.
    """
    stripped = text.strip()
    if _REPORT_HEADER_KEYWORD in stripped:
        return 'page_header'
    if _PAGE_NUMBER_PATTERN.match(stripped):
        return 'page_footer'
    return 'text'


def _format_run(first, last):
    """Format one run of consecutive numbers as 'first-last' or 'first'."""
    return str(first) if first == last else f"{first}-{last}"


def find_continuous_numbers(numbers):
    """Collapse integers into sorted range strings.

    Runs of consecutive numbers become 'start-end'; isolated numbers become
    a single string. Duplicates are ignored, the input list is not modified
    and an empty input yields an empty list.

    >>> find_continuous_numbers([1, 2, 3, 5, 7, 9, 10])
    ['1-3', '5', '7', '9-10']
    """
    ordered = sorted(set(numbers))
    if not ordered:
        return []

    ranges = []
    run_start = previous = ordered[0]
    for number in ordered[1:]:
        if number != previous + 1:
            # The current run ended at the previous number.
            ranges.append(_format_run(run_start, previous))
            run_start = number
        previous = number
    ranges.append(_format_run(run_start, previous))
    return ranges


def merge_consecutive_arrays(file_path):
    """Merge runs of consecutive table records read from a text file.

    Each line of the file is expected to hold one dict literal (the format
    printed by ``get_pdf_info``). Consecutive records whose 'type' is 'table'
    are merged into the first record of the run by extending its 'data'; any
    other record ends the run. Blank lines are ignored, and lines that cannot
    be parsed into a dict are reported and skipped without ending the run.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one merged table record per run.
    """
    merged_objects = []
    temp_array = {}  # the run currently being accumulated; empty when none

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            # ast.literal_eval only accepts literals, so unlike eval() it
            # cannot execute code embedded in the file.
            # NOTE(review): lines containing non-literal reprs (for example
            # numpy scalars) are now skipped instead of evaluated - confirm
            # the input never contains them.
            try:
                obj = ast.literal_eval(line)
            except (ValueError, SyntaxError) as e:
                print(f"Error decoding JSON line: {e}")
                continue
            if not isinstance(obj, dict):
                print(f"Error decoding JSON line: {line}")
                continue

            if obj['type'] == 'table':
                if temp_array:
                    temp_array['data'].extend(obj['data'])
                else:
                    temp_array = obj
            elif temp_array:
                # A non-table record closes the current run.
                merged_objects.append(temp_array)
                temp_array = {}

    if temp_array:
        merged_objects.append(temp_array)
    return merged_objects


if __name__ == "__main__":
    # An optional first command-line argument overrides the default PDF path.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else _DEFAULT_PDF_PATH

    # print(get_text_type('6/223 '.strip()))
    # start = time.time()
    get_pdf_info(pdf_path, 'all')
    # end = time.time()
    # print('Task %s runs %0.2f seconds.' % ('223', (end - start)))

    # Example list:
    # numbers = [1, 2, 3, 5, 7, 9, 10, 12, 13, 14, 17, 18, 19, 20, 22, 23, 24, 26, 27, 28, 29, 30, 32, 33, 34, 36, 37, 38, 39]
    # Call the function and print the result:
    # print(find_continuous_numbers(numbers))

    # Name the columns of two tables from their first row, then merge them:
    # df1 = tables[0].df
    # df2 = df1.rename(columns=df1.iloc[0]).drop(df1.index[0])  # use row 0 as the column names
    # df3 = tables[1].df
    # df4 = df3.rename(columns=df3.iloc[0]).drop(df3.index[0])
    # df__2 = df2.append(df4, ignore_index=True)  # align on column names and build a new index
    # print(df__2)

    # Call the function and print the result:
    # print(merge_consecutive_arrays('/Users/zhengfei/work/zzb_data/tables.txt'))