# File-viewer listing metadata: 251 lines, 9.6 KiB, Python
import ast
import json
import re
import sys
import time

import camelot
import numpy as np
import pdfplumber
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal

import utils

def chunks(l, n):
    """Lazily split ``l`` into consecutive slices of at most ``n`` items.

    ``l`` must support ``len()`` and slicing. The final slice is shorter
    when ``len(l)`` is not a multiple of ``n``.
    """
    starts = range(0, len(l), n)
    yield from (l[begin:begin + n] for begin in starts)


def extract_tables(filepath, pages_num, chunk_num=50, export_path=".", params=None):
    """Extract tables from a PDF in page chunks, exporting after every chunk.

    The requested pages are resolved through camelot's ``PDFHandler``, split
    into groups of at most ``chunk_num`` pages, and each group is read with
    ``camelot.read_pdf`` and exported before the next group is processed, so
    only one chunk's tables are held in memory at a time.

    Args:
        filepath: Filepath or URL of the PDF file.
        pages_num: Comma-separated page numbers, e.g. '1,3,4', '1,4-end'
            or 'all'.
        chunk_num: Maximum number of pages handed to camelot per call.
        export_path: Directory prefix used for the export target
            ``{export_path}/tables.csv``.
        params: Accepted for backward compatibility only; it is not
            forwarded to camelot (the read options are fixed below).
    """
    # Resolve the page selection with camelot's own (private) page parser.
    handler = camelot.handlers.PDFHandler(filepath)
    page_list = handler._get_pages(pages=pages_num)

    # Extract and export one chunk of pages at a time.
    for chunk in chunks(page_list, chunk_num):
        pages_string = ", ".join(str(page) for page in chunk)
        tables = camelot.read_pdf(filepath, pages=pages_string, strip_text=' ,\n', copy_text=['h'])
        tables.export(f"{export_path}/tables.csv")
        # Drop the reference before the next chunk is parsed.
        del tables


# Read the tables in a PDF and combine each table's indicators with its
# heading text, e.g. "operating revenue for Q1 2022 was xxxxx".
def get_pdf_info(file_path, pages):
    """Collect table records and nearby text records from a PDF and print them.

    Tables are read with camelot for ``pages``. Every page of the file is then
    walked with pdfminer, and horizontal text boxes that sit shortly above a
    table, or near the bottom of a page, are recorded next to the tables.
    All records are sorted by ``sort_num`` (page number * 1000 minus the top
    coordinate) and printed one per line.

    Args:
        file_path: Path of the PDF; passed to both camelot and pdfminer.
        pages: Page selection string forwarded to ``camelot.read_pdf``.

    Returns:
        None. The sorted records are written to stdout with ``print``.
    """
    # NOTE(review): the nesting of this function was reconstructed from a
    # source listing without indentation — confirm against the original file.
    tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'])

    pdf_info = []
    # page number -> vertical extents of the tables found on that page
    tables_range = {}

    for t in tables:
        top = t._bbox[3]
        buttom = t._bbox[1]
        page_num = int(t.page)
        table_index = int(t.order)

        tables_range.setdefault(page_num, []).append({
            'top' : top,
            'buttom' : buttom,
            'table_index' : table_index,
            'page_num' : page_num,
        })

        pdf_info.append({
            'top' : top,
            'buttom' : buttom,
            'page_num' : page_num,
            'table_index' : table_index,
            "type" : "table",
            "data" : t.data,
            # presumably coordinates stay below 1000 so pages never interleave — TODO confirm
            'sort_num' : page_num*1000 - top
        })

    for pagenum, page in enumerate(extract_pages(file_path)):
        current_page = pagenum + 1  # table records use 1-based page numbers

        # Walk the layout elements that make up the page.
        for i, element in enumerate(page._objs):
            # Only horizontal text boxes are of interest.
            if not isinstance(element, LTTextBoxHorizontal):
                continue

            text_type = 'text'
            line_text = element.get_text().replace('\n','')
            line_text = re.sub(r"\s", "", line_text)

            element_top = element.bbox[3]
            element_buttom = element.bbox[1]

            # Check the text against every table on this page.
            for table_range in tables_range.get(current_page) or []:
                if element_top < table_range['top'] and element_top > table_range['buttom']:
                    # The text lies inside this table's vertical span.
                    continue

                gap_above_table = element_top - table_range['top']
                if gap_above_table < 100 and gap_above_table > 5 and not text_in_table(element_top, tables_range, current_page):
                    if i == 0:
                        # The first element of a page may be the running header.
                        text_type = get_text_type(line_text)
                        if text_type == 'page_header':
                            break
                    if utils.check_table_title_black_list(line_text):
                        print(line_text)

                    pdf_info.append({
                        'top' : element_top,
                        'buttom' : element_buttom,
                        'page_num' : table_range['page_num'],
                        'table_index' : table_range['table_index'],
                        "type" : text_type,
                        'content' : line_text,
                        'sort_num' : table_range['page_num']*1000 - element_top
                    })
                    break

            # Handle a parent-company table title that sits at the bottom of a
            # page while the complete table is on the next page.
            if element_buttom < 150 and not text_in_table(element_top, tables_range, current_page):
                text_type = get_text_type(line_text)

                if text_type == 'page_footer':
                    continue

                pdf_info.append({
                    'top' : element_top,
                    'buttom' : element_buttom,
                    'page_num' : current_page,
                    "type" : text_type,
                    'content' : line_text,
                    'sort_num' : current_page*1000 - element_top
                })

    sorted_pdf_info = sorted(pdf_info, key=lambda k: k['sort_num'])
    for info in sorted_pdf_info:
        print(info)


def text_in_table(top, tables_range, page_num):
    """Return True when ``top`` lies strictly inside a table span on a page.

    Args:
        top: Vertical coordinate to test.
        tables_range: Mapping of page number to a list of dicts that each
            carry a 'top' and a 'buttom' coordinate.
        page_num: Page whose table spans are checked.

    Returns:
        True if ``buttom < top < top-of-table`` holds for at least one span
        of ``page_num``; False otherwise, including when the page has no
        entry in ``tables_range``.
    """
    return any(
        span['buttom'] < top < span['top']
        for span in tables_range.get(page_num) or ()
    )


def get_text_type(text: str):
    """Classify a line of text as 'page_header', 'page_footer' or 'text'.

    A line containing the annual-report marker is a page header; a line that
    is only a page number (``12`` or ``6/223``) is a page footer; anything
    else is plain text. Surrounding whitespace is ignored.
    """
    stripped = text.strip()

    # Running headers carry the "annual report" marker.
    if re.search('年度报告', stripped) is not None:
        return 'page_header'

    is_page_number = re.match(r'^\d+(/\d+)?$', stripped) is not None
    return 'page_footer' if is_page_number else 'text'


def find_continuous_numbers(numbers):
    """Collapse integers into sorted "start-end" range strings.

    Runs of consecutive integers become ``"first-last"``; an isolated integer
    becomes its own string. Duplicates are ignored, and the input is neither
    sorted in place nor otherwise modified.

    Args:
        numbers: Iterable of integers in any order.

    Returns:
        A list of strings in ascending order, e.g.
        ``[1, 2, 3, 5, 9, 10]`` -> ``['1-3', '5', '9-10']``.
        An empty input yields an empty list.
    """
    ordered = sorted(set(numbers))
    if not ordered:
        return []

    def span(first, last):
        # A run of one number is written without a dash.
        return str(first) if first == last else f"{first}-{last}"

    new_numbers = []
    run_start = previous = ordered[0]
    for value in ordered[1:]:
        if value != previous + 1:
            # The run ended at `previous`; emit it exactly once.
            new_numbers.append(span(run_start, previous))
            run_start = value
        previous = value

    # Emit the run that reaches the end of the list.
    new_numbers.append(span(run_start, previous))
    return new_numbers


def merge_consecutive_arrays(file_path):
    """Merge runs of consecutive table records read from a text file.

    Each non-blank line of ``file_path`` is parsed as a Python literal dict.
    Consecutive records whose 'type' is 'table' are merged into the first
    record of the run by extending its 'data' list; any other record ends
    the current run. Table records are expected to carry a 'data' list.

    Lines are parsed with ``ast.literal_eval`` rather than ``eval`` so file
    content can never execute code. Lines that are blank are skipped; lines
    that cannot be parsed, or that are not dicts, are reported and skipped.

    Args:
        file_path: Path of the text file, one record per line.

    Returns:
        A list of merged table records, in file order.
    """
    merged_objects = []
    current_table = None  # first record of the run being merged, if any

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue

            try:
                obj = ast.literal_eval(line)
            except (ValueError, TypeError, SyntaxError) as e:
                print(f"Error decoding JSON line: {e}")
                continue

            if not isinstance(obj, dict):
                print(f"Error decoding JSON line: expected a dict, got {type(obj).__name__}")
                continue

            if obj.get('type') == 'table':
                if current_table is None:
                    current_table = obj
                else:
                    current_table['data'].extend(obj['data'])
            elif current_table is not None:
                # A non-table record closes the current run of tables.
                merged_objects.append(current_table)
                current_table = None

    if current_table is not None:
        merged_objects.append(current_table)

    return merged_objects


if __name__ == "__main__":
    # Usage: python <this file> [pdf_path] [pages]
    # Both arguments are optional; without them the original developer
    # sample file and page selection are used, so a bare run is unchanged.
    default_pdf_path = '/Users/zhengfei/Desktop/0609/zhangjun-430489-2023-nb-nb_2824bc6302e9442285aed64eed760d99/zhangjun-430489-2023-nb-nb_2824bc6302e9442285aed64eed760d99_1.pdf'
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_pdf_path
    pages = sys.argv[2] if len(sys.argv) > 2 else 'all'

    get_pdf_info(pdf_path, pages)

    # Manual checks kept from development (uncomment to run).
    #
    # Page header/footer classification:
    # print(get_text_type('6/223 '.strip()))
    #
    # Timing a run:
    # start = time.time()
    # ...
    # end = time.time()
    # print('Task %s runs %0.2f seconds.' % ('223', (end - start)))
    #
    # Sample list for the range collapsing helper:
    # numbers = [1, 2, 3, 5, 7, 9, 10, 12, 13, 14, 17, 18, 19, 20, 22, 23, 24, 26, 27, 28, 29, 30, 32, 33, 34, 36, 37, 38, 39]
    # print(find_continuous_numbers(numbers))
    #
    # Name the columns of two tables from their first row, then merge them:
    # df1 = tables[0].df
    # df2 = df1.rename(columns=df1.iloc[0]).drop(df1.index[0])  # use row 0 as the column names
    # df3 = tables[1].df
    # df4 = df3.rename(columns=df3.iloc[0]).drop(df3.index[0])
    # df__2 = df2.append(df4, ignore_index=True)  # align on column names and build a new index
    # print(df__2)
    #
    # Merge consecutive table records from a dump file:
    # print(merge_consecutive_arrays('/Users/zhengfei/work/zzb_data/tables.txt'))