pdf_code/zzb_data_word/camelot_tables.py

251 lines
9.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import camelot
import time
import re
import numpy as np
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
import pdfplumber
import json
import utils
def chunks(l, n):
    """Lazily split ``l`` into consecutive slices holding at most ``n`` items.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))
def extract_tables(filepath, pages_num, chunk_num=50, export_path=".", params=None):
    """Extract tables from a PDF chunk by chunk and export every chunk as CSV.

    The selected pages are split into groups of at most ``chunk_num`` pages.
    Each group is parsed with ``camelot.read_pdf`` and exported before the
    next group is read, so only one group's tables are referenced at a time.

    Args:
        filepath: Filepath or URL of the PDF file.
        pages_num: Comma-separated page numbers,
            e.g. '1,3,4' or '1,4-end' or 'all'.
        chunk_num: Maximum number of pages handed to one ``read_pdf`` call.
        export_path: Directory prefix used to build the export path
            ``{export_path}/tables.csv``.
        params: Optional mapping of extra keyword arguments for
            ``camelot.read_pdf``. Entries override the defaults used here
            (``strip_text=' ,\\n'``, ``copy_text=['h']``).
    """
    # Start from the settings this module has always used, then let the
    # caller override or extend them. A fresh dict per call avoids sharing
    # state through a mutable default argument.
    read_kwargs = {"strip_text": ' ,\n', "copy_text": ['h']}
    if params:
        read_kwargs.update(params)

    # Resolve the page specification into an explicit list of pages.
    # NOTE(review): _get_pages is a private camelot helper; its signature may
    # differ between camelot versions — confirm against the pinned version.
    handler = camelot.handlers.PDFHandler(filepath)
    page_list = handler._get_pages(pages=pages_num)

    # Parse and export one group of pages at a time.
    for chunk in chunks(page_list, chunk_num):
        pages_string = ",".join(str(page) for page in chunk)
        tables = camelot.read_pdf(filepath, pages=pages_string, **read_kwargs)
        tables.export(f"{export_path}/tables.csv")
# Read the tables in a PDF and collect them together with the text boxes
# found near them (intended for merging table metrics with their headers,
# e.g. "operating revenue for Q1 2022 is xxxxx").
#
# file_path: path of the PDF, passed to both camelot.read_pdf and
#            pdfminer's extract_pages.
# pages:     page selection forwarded to camelot.read_pdf only.
#
# The collected records are sorted by 'sort_num' and printed one per line;
# the function has no return statement.
#
# NOTE(review): leading indentation is missing in this copy of the file, so
# the nesting of the statements below (in particular the append/break after
# the black-list check) must be confirmed against the original source.
def get_pdf_info(file_path, pages):
tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'])
pdf_info = []
# Maps page number -> list of {'top', 'buttom', 'table_index', 'page_num'}
# entries, one per table found on that page.
tables_range = {}
for table_num, t in enumerate(tables):
# Indexes 3 and 1 of the table bbox are used as its top and bottom edges
# (presumably bbox is (x1, y1, x2, y2) — confirm against camelot).
top = t._bbox[3]
buttom = t._bbox[1]
page_num = int(t.page)
table_index = int(t.order)
# NOTE(review): arr is not used anywhere else in this function.
arr = np.array(t.data)
if not tables_range.get(page_num):
tables_range[page_num] = []
tables_range[page_num].append({
'top' : top,
'buttom' : buttom,
'table_index' : table_index,
'page_num' : page_num,
})
pdf_info.append({
'top' : top,
'buttom' : buttom,
'page_num' : page_num,
'table_index' : table_index,
"type" : "table",
"data" : t.data,
# Sort key: earlier pages first; within a page a larger top sorts earlier
# (assumes top stays below 1000 so pages do not interleave — confirm).
'sort_num' : page_num*1000 - top
})
# extract_pages is called without a page filter, so every page of the file is
# scanned here regardless of the `pages` argument. pagenum is 0-based from
# enumerate, hence the pagenum+1 lookups into tables_range.
for pagenum, page in enumerate(extract_pages(file_path)):
# Each layout object is paired with its y1; only the object itself
# (component[1]) is read below.
page_elements = [(element.y1, element) for element in page._objs]
# Walk the elements that make up the page
for i,component in enumerate(page_elements):
text_type = 'text'
# Pull out the layout element
element = component[1]
# Check whether the element is a text element
if isinstance(element, LTTextBoxHorizontal):
# Normalise the text: drop newlines, then every remaining whitespace character
line_text = element.get_text().replace('\n','')
line_text = re.sub(r"\s", "", line_text)
element_top = element.bbox[3]
element_buttom = element.bbox[1]
# Check whether the text appears inside a table
if tables_range.get(pagenum+1):
for range in tables_range[pagenum+1]:
# print(f"{range['top']}: {range['buttom']}: {range['table_index']}")
# Text whose top edge lies strictly between this table's bottom and top
# is table content and is ignored.
if element_top < range['top'] and element_top > range['buttom']:
pass
else:
# Candidate title: the text top is more than 5 and less than 100 units
# above this table's top and is not inside any table on the page.
if element_top - range['top'] < 100 and element_top - range['top'] > 5 and not text_in_table(element_top, tables_range, pagenum+1):
# Only the first element of the page is tested for being a page header.
if i == 0:
text_type = get_text_type(line_text)
if text_type == 'page_header':
break
# NOTE(review): whether the append and break below sit inside this
# black-list check cannot be determined from this copy — confirm.
if utils.check_table_title_black_list(line_text):
print(line_text)
pdf_info.append({
'top' : element_top,
'buttom' : element_buttom,
'page_num' : range['page_num'],
'table_index' : range['table_index'],
"type" : text_type,
'content' : line_text,
'sort_num' : range['page_num']*1000 - element_top
})
break
# Handle a parent-company table title that sits at the bottom of the page
# while the complete table is on the next page
if element_buttom < 150 and not text_in_table(element_top, tables_range, pagenum+1):
text_type = get_text_type(line_text)
# Page-number footers are skipped; other bottom-of-page text is recorded
# without a table_index.
if text_type == 'page_footer':
continue
pdf_info.append({
'top' : element_top,
'buttom' : element_buttom,
'page_num' : pagenum+1,
"type" : text_type,
'content' : line_text,
'sort_num' : (pagenum+1)*1000 - element_top
})
# print(f'{element_top}: {element_buttom}: {line_text}')
# Emit everything in sort_num order; the sorted list is only printed.
sorted_pdf_info = sorted(pdf_info, key=lambda k: k['sort_num'])
for info in sorted_pdf_info:
print(info)
def text_in_table(top, tables_range, page_num):
    """Report whether ``top`` lies strictly inside a table span on ``page_num``.

    ``tables_range`` maps a page number to a list of dicts carrying the
    'top' and 'buttom' edges of each table on that page. A page with no
    entry (or an empty list) never contains the position.
    """
    page_tables = tables_range.get(page_num)
    if not page_tables:
        return False
    return any(
        top < span['top'] and top > span['buttom']
        for span in page_tables
    )
def get_text_type(text: str):
    """Classify a text fragment as page header, page footer or ordinary text.

    Returns 'page_header' when the stripped text contains the annual-report
    marker, 'page_footer' when it consists solely of a page number such as
    '6' or '6/223', and 'text' otherwise. The header check takes precedence.
    """
    stripped = text.strip()
    if '年度报告' in stripped:
        return 'page_header'
    is_page_number = re.match(r'^\d+(/\d+)?$', stripped) is not None
    return 'page_footer' if is_page_number else 'text'
def find_continuous_numbers(numbers):
    """Collapse integers into a sorted list of range strings.

    Runs of consecutive values are written as 'first-last'; a value with no
    neighbour is written once as 'value'. For example ``[1, 2, 3, 5, 9, 10]``
    becomes ``['1-3', '5', '9-10']``.

    Args:
        numbers: Iterable of integers in any order. Duplicates are ignored
            and the input is not modified.

    Returns:
        A list of strings in ascending order; empty when ``numbers`` is empty.
    """
    # Work on a sorted, de-duplicated copy so the caller's data is untouched
    # and repeated values cannot split a run.
    ordered = sorted(set(numbers))
    if not ordered:
        return []

    def _label(first, last):
        # A run of length one is just the number itself.
        return str(first) if first == last else f"{first}-{last}"

    new_numbers = []
    run_start = previous = ordered[0]
    for current in ordered[1:]:
        if current != previous + 1:
            # Gap found: close the run that ended at `previous`.
            new_numbers.append(_label(run_start, previous))
            run_start = current
        previous = current
    # Close the final run.
    new_numbers.append(_label(run_start, previous))
    return new_numbers
def merge_consecutive_arrays(file_path):
    """Merge runs of consecutive table records read from a text file.

    Every non-blank line is parsed as a Python literal and is expected to be
    a dict with a 'type' key (presumably the records printed by
    ``get_pdf_info`` — confirm). Consecutive records whose type is 'table'
    are merged by appending their 'data' rows to the first record of the
    run. Any other record ends the current run and is itself discarded.

    Args:
        file_path: Path of the text file, one record per line.

    Returns:
        A list with one merged table record per run, in file order.
    """
    # Local import keeps this block self-contained.
    import ast

    merged_objects = []
    pending = {}  # first table record of the run currently being merged
    with open(file_path, 'r') as file:
        for line in file:
            # Drop the trailing newline and surrounding whitespace.
            line = line.strip()
            if not line:
                # Blank lines carry no record.
                continue
            try:
                # literal_eval only accepts literals, so text from the file
                # can never execute code the way eval() would.
                obj = ast.literal_eval(line)
            except (ValueError, TypeError, SyntaxError) as e:
                print(f"Error decoding JSON line: {e}")
                continue
            if obj['type'] == 'table':
                if not pending.get('page_num'):
                    # First table of a new run.
                    pending = obj
                else:
                    # Continuation: append its rows to the run's first table.
                    pending['data'].extend(obj['data'])
            elif pending:
                # A non-table record closes the current run.
                merged_objects.append(pending)
                pending = {}
    # Flush a run that reaches the end of the file.
    if pending:
        merged_objects.append(pending)
    return merged_objects
if __name__ == "__main__":
    # Manual entry point: run get_pdf_info over every page of one PDF.
    # The PDF path may be given as the first command-line argument; without
    # it the original sample report path is used.
    import sys

    default_pdf = '/Users/zhengfei/Desktop/0609/zhangjun-430489-2023-nb-nb_2824bc6302e9442285aed64eed760d99/zhangjun-430489-2023-nb-nb_2824bc6302e9442285aed64eed760d99_1.pdf'
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_pdf
    # print(get_text_type('6/223 '.strip()))
    # start = time.time()
    get_pdf_info(pdf_path, 'all')
    # end = time.time()
    # print('Task %s runs %0.2f seconds.' % ('223', (end - start)))
    # Sample array
    # numbers = [1, 2, 3, 5, 7, 9, 10, 12, 13, 14, 17, 18, 19, 20, 22, 23, 24, 26, 27, 28, 29, 30, 32, 33, 34, 36, 37, 38, 39]
    # # Call the function and print the result
    # print(find_continuous_numbers(numbers))
    # Sample list of array objects
    # Name the columns and set the index of two tables separately, then merge the two tables:
    # df1 = tables[0].df
    # df2 = df1.rename(columns=df1.iloc[0]).drop(df1.index[0])  ## use row 0 as the column labels
    # df3 = tables[1].df
    # df4 = df3.rename(columns=df3.iloc[0]).drop(df3.index[0])
    # df__2= df2.append(df4,ignore_index=True)  ## merge the two frames; ignore_index=True aligns on column names and builds a new index
    # print(df__2)
    # Call the function and print the result
    # print(merge_consecutive_arrays('/Users/zhengfei/work/zzb_data/tables.txt'))