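"""Utilities for extracting tables and nearby text from PDF annual reports.

Tables are pulled out with Camelot; the surrounding text (table titles, page
headers and footers) is located with pdfminer so that each table can be
paired with the caption that introduces it.
"""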
import camelot
import time
import re
import ast  # used by merge_consecutive_arrays to parse printed records
import numpy as np
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
import pdfplumber
import json

import utils


def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i : i + n]
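

# A quick illustration (values are illustrative, not from the source data):
#   list(chunks([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]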


def extract_tables(filepath, pages_num, chunk_num=50, export_path=".", params={}):
    """
    Divide the extraction work into n chunks. At the end of every chunk,
    save data on disk and free RAM.

    filepath : str
        Filepath or URL of the PDF file.
    pages_num : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    """
    # get the list of pages from camelot.handlers.PDFHandler
    handler = camelot.handlers.PDFHandler(filepath)
    page_list = handler._get_pages(pages=pages_num)

    # split the page list into chunks
    page_chunks = list(chunks(page_list, chunk_num))

    # extract and export one chunk at a time
    for chunk in page_chunks:
        pages_string = str(chunk).replace("[", "").replace("]", "")
        tables = camelot.read_pdf(filepath, pages=pages_string, strip_text=' ,\n', copy_text=['h'])
        tables.export(f"{export_path}/tables.csv")
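

# A minimal usage sketch (the file name below is hypothetical): process a long
# report 50 pages at a time so Camelot's memory use stays bounded. Camelot
# writes one CSV per extracted table under export_path.
#
#   extract_tables("annual_report.pdf", pages_num="all", chunk_num=50,
#                  export_path="./out")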


# Read the tables in the PDF and merge each metric with its table header,
# e.g. "operating revenue for Q1 2022 is xxxxx".
def get_pdf_info(file_path, pages):
    tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'])

    pdf_info = []
    tables_range = {}

    for table_num, t in enumerate(tables):
        # bounding box and position of the table on its page
        top = t._bbox[3]
        buttom = t._bbox[1]
        page_num = int(t.page)
        table_index = int(t.order)
        arr = np.array(t.data)
        if not tables_range.get(page_num):
            tables_range[page_num] = []

        tables_range[page_num].append({
            'top': top,
            'buttom': buttom,
            'table_index': table_index,
            'page_num': page_num,
        })

        pdf_info.append({
            'top': top,
            'buttom': buttom,
            'page_num': page_num,
            'table_index': table_index,
            "type": "table",
            "data": t.data,
            'sort_num': page_num * 1000 - top
        })

    # walk the pdfminer layout to pick up the text around each table
    for pagenum, page in enumerate(extract_pages(file_path)):
        page_elements = [(element.y1, element) for element in page._objs]
        # inspect the elements that make up the page
        for i, component in enumerate(page_elements):

            text_type = 'text'
            # pull the element out of the page layout
            element = component[1]
            # check whether the element is a text element
            if isinstance(element, LTTextBoxHorizontal):
                line_text = element.get_text().replace('\n', '')
                line_text = re.sub(r"\s", "", line_text)

                element_top = element.bbox[3]
                element_buttom = element.bbox[1]

                # check whether the text appears inside a table on this page
                if tables_range.get(pagenum + 1):
                    for table_range in tables_range[pagenum + 1]:
                        # print(f"{table_range['top']}: {table_range['buttom']}: {table_range['table_index']}")
                        if element_top < table_range['top'] and element_top > table_range['buttom']:
                            pass
                        else:
                            # text slightly above a table is treated as its title
                            if element_top - table_range['top'] < 100 and element_top - table_range['top'] > 5 and not text_in_table(element_top, tables_range, pagenum + 1):
                                if i == 0:
                                    text_type = get_text_type(line_text)
                                    if text_type == 'page_header':
                                        break
                                if utils.check_table_title_black_list(line_text):
                                    print(line_text)

                                pdf_info.append({
                                    'top': element_top,
                                    'buttom': element_buttom,
                                    'page_num': table_range['page_num'],
                                    'table_index': table_range['table_index'],
                                    "type": text_type,
                                    'content': line_text,
                                    'sort_num': table_range['page_num'] * 1000 - element_top
                                })
                                break
                # handle a parent-company table title that sits at the bottom of
                # the page while the full table is on the next page
                if element_buttom < 150 and not text_in_table(element_top, tables_range, pagenum + 1):
                    text_type = get_text_type(line_text)

                    if text_type == 'page_footer':
                        continue

                    pdf_info.append({
                        'top': element_top,
                        'buttom': element_buttom,
                        'page_num': pagenum + 1,
                        "type": text_type,
                        'content': line_text,
                        'sort_num': (pagenum + 1) * 1000 - element_top
                    })
                # print(f'{element_top}: {element_buttom}: {line_text}')

    sorted_pdf_info = sorted(pdf_info, key=lambda k: k['sort_num'])
    for info in sorted_pdf_info:
        print(info)
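

# Roughly, get_pdf_info() prints entries of the following shapes (field values
# here are illustrative, not taken from a real report):
#   {'top': 706.9, 'buttom': 354.2, 'page_num': 3, 'table_index': 1,
#    'type': 'table', 'data': [['...', '...'], ...], 'sort_num': 2293.1}
#   {'top': 721.3, 'buttom': 709.5, 'page_num': 3, 'table_index': 1,
#    'type': 'text', 'content': '合并资产负债表', 'sort_num': 2278.7}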


def text_in_table(top, tables_range, page_num):
    if tables_range.get(page_num):
        for table_range in tables_range[page_num]:
            if top < table_range['top'] and top > table_range['buttom']:
                return True
    return False


def get_text_type(text: str):
    # '年度报告' ("annual report") marks the running page header of the document
    first_re = '年度报告'
    page_number_pattern = re.compile(r'^\d+(/\d+)?$')

    if re.search(first_re, text.strip()):
        return 'page_header'

    if page_number_pattern.match(text.strip()):
        return 'page_footer'

    return 'text'
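

# For example (derived from the patterns above):
#   get_text_type('2023年年度报告') -> 'page_header'
#   get_text_type('6/223')          -> 'page_footer'
#   get_text_type('营业收入')        -> 'text'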


def find_continuous_numbers(numbers):
    # sort the list first
    numbers.sort()

    # initialise the output list and the start index of the current run
    new_numbers = []
    start_index = 0

    # walk through the sorted list
    for i in range(1, len(numbers)):
        # check whether the current number breaks the run
        if numbers[i] != numbers[i - 1] + 1:
            # the run ended at numbers[i-1]; emit it
            if i - start_index > 1:
                # a run longer than one number is written as "min-max"
                new_numbers.append(f"{numbers[start_index]}-{numbers[i-1]}")
            else:
                # a run of a single number is appended as-is
                new_numbers.append(str(numbers[start_index]))

            # start a new run at the current number
            start_index = i

    # emit the run that reaches the end of the list
    if len(numbers) - start_index > 1:
        new_numbers.append(f"{numbers[start_index]}-{numbers[-1]}")
    else:
        new_numbers.append(str(numbers[start_index]))

    return new_numbers
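

# For example:
#   find_continuous_numbers([1, 2, 3, 5, 9, 10]) -> ['1-3', '5', '9-10']
# (presumably for building Camelot-style page strings such as '1-3,5').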


def merge_consecutive_arrays(file_path):
    merged_objects = []
    temp_array = {}

    # open the file and read it line by line
    with open(file_path, 'r') as file:
        for line in file:
            # strip the trailing newline
            line = line.strip()
            # try to parse the line as a Python literal (the records are
            # written with print/repr, not as strict JSON)
            try:
                obj = ast.literal_eval(line)
                if obj['type'] == 'table':
                    # table record: start a new buffer or append its rows
                    if not temp_array.get('page_num'):
                        temp_array = obj
                    else:
                        temp_array['data'].extend(obj['data'])
                else:
                    # non-table record: flush the buffered table, if any
                    if temp_array:
                        # add the merged table to the output list
                        merged_objects.append(temp_array)
                        temp_array = {}  # reset the buffer
            except (ValueError, SyntaxError) as e:
                print(f"Error decoding line: {e}")

    # flush the last buffered table
    if temp_array:
        merged_objects.append(temp_array)

    return merged_objects
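

# The input file is expected to hold one record per line, in the same repr
# form that get_pdf_info() prints (see the illustrative entries above);
# consecutive 'table' records are merged by concatenating their 'data' rows.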


if __name__ == "__main__":
    # print(get_text_type('6/223 '.strip()))
    # start = time.time()
    get_pdf_info('/Users/zhengfei/Desktop/0609/zhangjun-430489-2023-nb-nb_2824bc6302e9442285aed64eed760d99/zhangjun-430489-2023-nb-nb_2824bc6302e9442285aed64eed760d99_1.pdf', 'all')
    # end = time.time()
    # print('Task %s runs %0.2f seconds.' % ('223', (end - start)))

    # example list
    # numbers = [1, 2, 3, 5, 7, 9, 10, 12, 13, 14, 17, 18, 19, 20, 22, 23, 24, 26, 27, 28, 29, 30, 32, 33, 34, 36, 37, 38, 39]
    # # call the function and print the result
    # print(find_continuous_numbers(numbers))

    # example list of table objects
    # rename the columns and set the index on each of the two tables, then merge them:
    # df1 = tables[0].df
    # df2 = df1.rename(columns=df1.iloc[0]).drop(df1.index[0])  # use row 0 as the header
    # df3 = tables[1].df
    # df4 = df3.rename(columns=df3.iloc[0]).drop(df3.index[0])
    # df__2 = df2.append(df4, ignore_index=True)  # merge the two tables; ignore_index=True aligns on column names and builds a new index
    # print(df__2)

    # call the function and print the result
    # print(merge_consecutive_arrays('/Users/zhengfei/work/zzb_data/tables.txt'))