# pdf_code/zzb_data_word/camelot_tables.py
# (Web-view export of the repository file: 251 lines, 9.6 KiB, Python;
# last modified 2024-12-30 17:51:12 +08:00.)
import camelot
import time
import re
import numpy as np
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
import pdfplumber
import json
import utils
def chunks(l, n):
    """Lazily produce consecutive slices of ``l`` holding at most ``n`` items each.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    yield from (l[start : start + n] for start in range(0, len(l), n))
def extract_tables(filepath, pages_num, chunk_num=50, export_path=".", params=None):
    """Extract tables from a PDF chunk by chunk, exporting after every chunk.

    The page selection is expanded to a page list, split into groups of
    ``chunk_num`` pages, and each group is read and exported separately so
    that only one group's tables are held in memory at a time.

    filepath : str
        Filepath or URL of the PDF file.
    pages_num : str
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    chunk_num : int, optional (default: 50)
        Maximum number of pages handled per ``camelot.read_pdf`` call.
    export_path : str, optional (default: '.')
        Directory used as the prefix of the ``tables.csv`` export path.
    params : dict, optional
        Accepted for interface compatibility only; it is not forwarded to
        camelot by this function.
    """
    # Expand the page selection using camelot's own PDF handler.
    handler = camelot.handlers.PDFHandler(filepath)
    page_list = handler._get_pages(pages=pages_num)

    # Extract and export one chunk of pages at a time.
    for chunk in chunks(page_list, chunk_num):
        # Same "1, 2, 3" text the list repr produced, without string surgery.
        pages_string = ", ".join(str(page) for page in chunk)
        tables = camelot.read_pdf(filepath, pages=pages_string, strip_text=' ,\n', copy_text=['h'])
        tables.export(f"{export_path}/tables.csv")
        # Drop the reference before the next chunk is parsed so the previous
        # chunk's tables do not stay alive during the next read.
        del tables
# Read the tables in the PDF and merge each table's metrics with its header,
# e.g. "operating revenue for Q1 2022 is xxxxx".
def get_pdf_info(file_path, pages):
"""Collect table records and selected text records from a PDF, then print them sorted.

Tables are read with ``camelot.read_pdf``; text boxes are read with pdfminer's
``extract_pages``. Every record is a dict carrying ``top``, ``buttom``,
``page_num``, ``type`` and ``sort_num``. Table records also carry ``data`` and
``table_index``; text records carry ``content`` (and ``table_index`` when they
are attached to a table).

file_path: path handed to both camelot and pdfminer.
pages: page selection forwarded to ``camelot.read_pdf``. Note that
``extract_pages`` is called without a page selection.

No return statement is visible: the sorted records are only printed.

NOTE(review): indentation was lost in this copy of the file, so the exact
nesting of the conditionals below must be confirmed against the real source.
"""
tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'])
# pdf_info: flat list of table/text records; tables_range: page number -> list of table extents.
pdf_info = []
tables_range = {}
for table_num, t in enumerate(tables):
# _bbox[3] is stored as the table's top and _bbox[1] as its bottom.
top = t._bbox[3]
buttom = t._bbox[1]
page_num = int(t.page)
table_index = int(t.order)
# NOTE(review): arr is not used anywhere in the visible code.
arr = np.array(t.data)
if not tables_range.get(page_num):
tables_range[page_num] = []
tables_range[page_num].append({
'top' : top,
'buttom' : buttom,
'table_index' : table_index,
'page_num' : page_num,
})
# sort_num grows with the page number and shrinks as top grows
# (presumably page-by-page, top-to-bottom order, assuming y grows upward
# and tops stay below 1000 — TODO confirm).
pdf_info.append({
'top' : top,
'buttom' : buttom,
'page_num' : page_num,
'table_index' : table_index,
"type" : "table",
"data" : t.data,
'sort_num' : page_num*1000 - top
})
# pagenum is 0-based here, while tables_range is keyed by int(t.page), hence the pagenum+1 lookups.
for pagenum, page in enumerate(extract_pages(file_path)):
# Pair every layout object with its y1 coordinate; only the object itself is used below.
page_elements = [(element.y1, element) for element in page._objs]
# Walk the elements that make up the page
for i,component in enumerate(page_elements):
# Default classification until get_text_type says otherwise.
text_type = 'text'
# Take the layout element out of the (y1, element) pair
element = component[1]
# Only horizontal text boxes are processed
if isinstance(element, LTTextBoxHorizontal):
# Extract the text and remove all whitespace from it
line_text = element.get_text().replace('\n','')
line_text = re.sub(r"\s", "", line_text)
element_top = element.bbox[3]
element_buttom = element.bbox[1]
# Check whether this text falls inside a table on this page
if tables_range.get(pagenum+1):
# NOTE(review): the loop variable shadows the builtin `range` inside this function.
for range in tables_range[pagenum+1]:
# print(f"{range['top']}: {range['buttom']}: {range['table_index']}")
# Text whose top lies strictly between the table's bottom and top is ignored here.
if element_top < range['top'] and element_top > range['buttom']:
pass
else:
# Candidate title: its top exceeds the table's top by more than 5 and less than 100,
# and it does not lie inside any table on this page.
if element_top - range['top'] < 100 and element_top - range['top'] > 5 and not text_in_table(element_top, tables_range, pagenum+1):
# The first object on the page is classified; a page header ends the table loop.
if i == 0:
text_type = get_text_type(line_text)
if text_type == 'page_header':
break
# presumably truthy when the text is acceptable as a table title; verify in utils
if utils.check_table_title_black_list(line_text):
print(line_text)
pdf_info.append({
'top' : element_top,
'buttom' : element_buttom,
'page_num' : range['page_num'],
'table_index' : range['table_index'],
"type" : text_type,
'content' : line_text,
'sort_num' : range['page_num']*1000 - element_top
})
break
# Handle the case where a parent-company table title sits at the bottom of the page
# and the full table is on the next page
# (150 is a hard-coded threshold on the element's bottom coordinate).
if element_buttom < 150 and not text_in_table(element_top, tables_range, pagenum+1):
text_type = get_text_type(line_text)
# Page-number footers are skipped.
if text_type == 'page_footer':
continue
pdf_info.append({
'top' : element_top,
'buttom' : element_buttom,
'page_num' : pagenum+1,
"type" : text_type,
'content' : line_text,
'sort_num' : (pagenum+1)*1000 - element_top
})
# print(f'{element_top}: {element_buttom}: {line_text}')
# Order all records by ascending sort_num and print them one per line.
sorted_pdf_info = sorted(pdf_info, key=lambda k: k['sort_num'])
for info in sorted_pdf_info:
print(info)
def text_in_table(top, tables_range, page_num):
    """Tell whether ``top`` lies strictly inside any table extent on ``page_num``.

    ``tables_range`` maps a page number to a list of dicts holding the
    ``'top'`` and ``'buttom'`` coordinates of each table on that page.
    Returns False when the page has no recorded tables.
    """
    page_tables = tables_range.get(page_num)
    if not page_tables:
        return False
    return any(
        top < extent['top'] and top > extent['buttom']
        for extent in page_tables
    )
def get_text_type(text: str):
    """Classify a text snippet as 'page_header', 'page_footer' or plain 'text'.

    After stripping surrounding whitespace, a snippet containing the
    annual-report marker is a page header; a snippet that is only a page
    number (``12`` or ``6/223``) is a page footer; anything else is 'text'.
    """
    stripped = text.strip()
    if re.search('年度报告', stripped):
        return 'page_header'
    looks_like_page_number = re.compile(r'^\d+(/\d+)?$').match(stripped)
    return 'page_footer' if looks_like_page_number else 'text'
def _format_run(start, end):
    """Render one run: a lone value as 'n', a longer run as 'start-end'."""
    return str(start) if start == end else f"{start}-{end}"


def find_continuous_numbers(numbers):
    """Collapse integers into a sorted list of 'start-end' / single-value strings.

    Example: ``[1, 2, 3, 5, 9, 10]`` -> ``['1-3', '5', '9-10']``.

    numbers: list of integers. It is sorted in place, as before.
    Returns a list of strings; an empty input yields an empty list.

    Fixes over the previous implementation: an isolated number that was not
    last in the list used to be emitted twice, an empty list raised
    IndexError, and repeated values split a run.
    """
    # Sort first so that consecutive values become neighbours.
    numbers.sort()

    collapsed = []
    run_start = run_end = None
    for value in numbers:
        if run_start is None:
            # First value opens the first run.
            run_start = run_end = value
        elif value == run_end:
            # Repeated value: already covered by the current run.
            continue
        elif value == run_end + 1:
            run_end = value
        else:
            # Gap found: close the current run and open a new one.
            collapsed.append(_format_run(run_start, run_end))
            run_start = run_end = value

    # Close the trailing run, if there was any input at all.
    if run_start is not None:
        collapsed.append(_format_run(run_start, run_end))
    return collapsed
def merge_consecutive_arrays(file_path):
    """Merge runs of consecutive ``'table'`` records read from a text file.

    Every non-blank line of ``file_path`` is evaluated as a Python expression
    that should yield a dict with a ``'type'`` key. Consecutive records whose
    type is ``'table'`` are merged into the first record of the run by
    extending its ``'data'`` list; any other record ends the current run.

    file_path: path of the text file, one record per line.
    Returns the list of merged table records. Non-table records are not
    returned. Blank lines are skipped; lines that cannot be evaluated, or
    that do not yield a dict with a ``'type'`` key, are reported and skipped.
    """
    merged_objects = []
    pending = None  # table record currently being accumulated

    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue
            # SECURITY: eval() executes arbitrary code found in the file. Only
            # use this on trusted input; prefer ast.literal_eval if every
            # record is a plain Python literal.
            try:
                obj = eval(line)
            except (SyntaxError, NameError, ValueError, TypeError) as e:
                # eval never raises json.JSONDecodeError, so catch what it
                # does raise and keep going with the next line.
                print(f"Error decoding JSON line: {e}")
                continue
            if not isinstance(obj, dict) or 'type' not in obj:
                print(f"Error decoding JSON line: {line}")
                continue

            if obj['type'] == 'table':
                if pending is None:
                    # First table of a run becomes the accumulator.
                    pending = obj
                else:
                    pending['data'].extend(obj['data'])
            elif pending is not None:
                # A non-table record closes the current run.
                merged_objects.append(pending)
                pending = None

    # Flush a run that reaches the end of the file.
    if pending is not None:
        merged_objects.append(pending)
    return merged_objects
if __name__ == "__main__":
    # Ad-hoc manual run against a local sample PDF, covering all pages.
    sample_pdf = '/Users/zhengfei/Desktop/0609/zhangjun-430489-2023-nb-nb_2824bc6302e9442285aed64eed760d99/zhangjun-430489-2023-nb-nb_2824bc6302e9442285aed64eed760d99_1.pdf'
    # print(get_text_type('6/223 '.strip()))
    # start = time.time()
    get_pdf_info(sample_pdf, 'all')
    # end = time.time()
    # print('Task %s runs %0.2f seconds.' % ('223', (end - start)))

    # Sample list of page numbers
    # numbers = [1, 2, 3, 5, 7, 9, 10, 12, 13, 14, 17, 18, 19, 20, 22, 23, 24, 26, 27, 28, 29, 30, 32, 33, 34, 36, 37, 38, 39]
    # # Call the function and print the result
    # print(find_continuous_numbers(numbers))

    # Sample list of table objects:
    # name the columns and set the index for each of the two tables, then
    # merge them, as follows:
    # df1 = tables[0].df
    # df2 = df1.rename(columns=df1.iloc[0]).drop(df1.index[0])  ## use row 0 as the column header
    # df3 = tables[1].df
    # df4 = df3.rename(columns=df3.iloc[0]).drop(df3.index[0])
    # df__2= df2.append(df4,ignore_index=True)  ## merge the two frames; ignore_index=True aligns on column names and builds a new index
    # print(df__2)

    # Call the function and print the result
    # print(merge_consecutive_arrays('/Users/zhengfei/work/zzb_data/tables.txt'))