# pdf_code/zzb_data_word/camelot_tables.py

import camelot
import time
import re
import numpy as np
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
import pdfplumber
import json
import utils

def chunks(l, n):
    """Lazily split ``l`` into consecutive slices holding at most ``n`` items each."""
    yield from (l[start:start + n] for start in range(0, len(l), n))


def extract_tables(filepath, pages_num, chunk_num=50, export_path=".", params=None):
    """
    Extract tables from a PDF chunk by chunk and export each chunk to disk.

    The requested pages are split into groups of ``chunk_num`` pages; every
    group is read with ``camelot.read_pdf`` and exported before the next group
    is read, so only one group's tables are referenced at a time.

    filepath : str
        Filepath or URL of the PDF file.
    pages_num : str
        Comma-separated page numbers, passed to camelot's page parser.
        Example: '1,3,4' or '1,4-end' or 'all'.
    chunk_num : int, optional (default: 50)
        Maximum number of pages processed per ``camelot.read_pdf`` call.
    export_path : str, optional (default: '.')
        Directory used as the prefix of the ``tables.csv`` export path.
    params : dict, optional (default: None)
        Accepted for interface compatibility only; it is NOT forwarded to
        ``camelot.read_pdf`` (the extraction options are fixed below).
    """
    # ``params`` used to default to a shared mutable ``{}``; ``None`` avoids
    # that pitfall. The value is still unused, exactly as before.

    # get list of pages from camelot.handlers.PDFHandler
    handler = camelot.handlers.PDFHandler(filepath)
    page_list = handler._get_pages(pages=pages_num)

    # extraction and export, one chunk of pages at a time
    for chunk in chunks(page_list, chunk_num):
        # Same "1, 2, 3" form that stripping the brackets from str(list) gave.
        pages_string = ", ".join(str(page) for page in chunk)
        tables = camelot.read_pdf(filepath, pages=pages_string, strip_text=' ,\n', copy_text=['h'])
        tables.export(f"{export_path}/tables.csv")

# Read the tables in a PDF (original intent: merge each table's indicators with
# its header, e.g. "operating revenue for Q1 2022 is xxxxx").
def get_pdf_info(file_path, pages):
    """Collect the tables of a PDF plus the free text located near them.

    Steps:
      1. Tables are read with ``camelot.read_pdf`` (lattice defaults,
         ``strip_text=' ,\\n'``, ``copy_text=['h']``). For each table the top
         (``_bbox[3]``) and bottom (``_bbox[1]``) edges, page number and order
         on the page are recorded.
      2. Every page is walked with pdfminer's ``extract_pages``. A horizontal
         text box that does not lie inside any table of its page is recorded
         when
           * its top edge is more than 5 and less than 100 units above the top
             of a table on the same page (attached to the first such table), or
           * its bottom edge is below 150, i.e. near the bottom of the page
             (handles a table title at the page bottom whose table starts on
             the next page); pure page-number footers are skipped.
         A text box can satisfy both conditions and is then recorded twice.
      3. All records are sorted by ``sort_num`` (``page * 1000 - top``), printed
         one per line, and returned.

    Args:
        file_path: Path of the PDF file.
        pages: Page selection string handed to ``camelot.read_pdf``.

    Returns:
        list[dict]: The sorted records. Earlier versions only printed them and
        returned ``None``.
    """
    tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'])

    pdf_info = []
    tables_range = {}

    for t in tables:
        top = t._bbox[3]
        buttom = t._bbox[1]
        page_num = int(t.page)
        table_index = int(t.order)

        # Vertical extent of every table, grouped by page.
        tables_range.setdefault(page_num, []).append({
            'top': top,
            'buttom': buttom,
            'table_index': table_index,
            'page_num': page_num,
        })

        pdf_info.append({
            'top': top,
            'buttom': buttom,
            'page_num': page_num,
            'table_index': table_index,
            "type": "table",
            "data": t.data,
            'sort_num': page_num * 1000 - top,
        })

    for pagenum, page in enumerate(extract_pages(file_path)):
        current_page = pagenum + 1  # camelot pages are 1-based
        page_tables = tables_range.get(current_page, [])

        # ``i`` is the index among ALL layout objects of the page.
        for i, element in enumerate(page._objs):
            # Only horizontal text boxes are of interest.
            if not isinstance(element, LTTextBoxHorizontal):
                continue

            element_top = element.bbox[3]
            element_buttom = element.bbox[1]

            # Text located inside a table is already part of that table's data.
            if text_in_table(element_top, tables_range, current_page):
                continue

            line_text = element.get_text().replace('\n', '')
            line_text = re.sub(r"\s", "", line_text)
            text_type = 'text'

            # Look for a table whose top edge is just below this text.
            for table_range in page_tables:
                gap = element_top - table_range['top']
                if not 5 < gap < 100:
                    continue

                # Only the first object of a page is checked for a page header.
                if i == 0:
                    text_type = get_text_type(line_text)
                    if text_type == 'page_header':
                        break
                if utils.check_table_title_black_list(line_text):
                    print(line_text)

                pdf_info.append({
                    'top': element_top,
                    'buttom': element_buttom,
                    'page_num': table_range['page_num'],
                    'table_index': table_range['table_index'],
                    "type": text_type,
                    'content': line_text,
                    'sort_num': table_range['page_num'] * 1000 - element_top,
                })
                break

            # Table title at the bottom of the page, full table on the next page.
            if element_buttom < 150:
                text_type = get_text_type(line_text)

                if text_type == 'page_footer':
                    continue

                pdf_info.append({
                    'top': element_top,
                    'buttom': element_buttom,
                    'page_num': current_page,
                    "type": text_type,
                    'content': line_text,
                    'sort_num': current_page * 1000 - element_top,
                })

    sorted_pdf_info = sorted(pdf_info, key=lambda k: k['sort_num'])
    for info in sorted_pdf_info:
        print(info)
    return sorted_pdf_info
    

def text_in_table(top, tables_range, page_num):
    """Tell whether the vertical position ``top`` falls inside a table of ``page_num``.

    ``tables_range`` maps a page number to a list of dicts carrying the
    ``'top'`` and ``'buttom'`` edges of each table on that page. The bounds
    are exclusive. Returns ``False`` when the page has no recorded tables.
    """
    page_tables = tables_range.get(page_num)
    if not page_tables:
        return False
    return any(
        table['top'] > top and top > table['buttom']
        for table in page_tables
    )

def get_text_type(text: str):
    """Classify a line of page text.

    Returns ``'page_header'`` when the stripped text contains '年度报告',
    ``'page_footer'`` when it is a bare page number such as ``12`` or
    ``6/223``, and ``'text'`` otherwise. The header check takes precedence.
    """
    stripped = text.strip()

    if re.search('年度报告', stripped) is not None:
        return 'page_header'

    looks_like_page_number = re.match(r'^\d+(/\d+)?$', stripped) is not None
    return 'page_footer' if looks_like_page_number else 'text'

def find_continuous_numbers(numbers):
    """Collapse integers into a sorted list of ``"a-b"`` ranges and single values.

    Runs of two or more consecutive integers become ``"first-last"``; an
    isolated integer becomes its plain string form. For example
    ``[1, 2, 3, 5, 9, 10]`` gives ``['1-3', '5', '9-10']``.

    Args:
        numbers: Iterable of integers in any order. It is not modified, and
            duplicate values are counted once.

    Returns:
        list[str]: The range/single-value strings in ascending order; an empty
        list for empty input.
    """
    # Work on a sorted, de-duplicated copy so the caller's list stays untouched
    # and a repeated value cannot split a run.
    ordered = sorted(set(numbers))
    if not ordered:
        return []

    def describe(first, last):
        # A run of length one is written as the number itself.
        return str(first) if first == last else f"{first}-{last}"

    new_numbers = []
    run_start = previous = ordered[0]
    for value in ordered[1:]:
        if value != previous + 1:
            # The current run ended at ``previous``; emit it exactly once.
            new_numbers.append(describe(run_start, previous))
            run_start = value
        previous = value

    # Emit the run that reaches the end of the sequence.
    new_numbers.append(describe(run_start, previous))
    return new_numbers

def merge_consecutive_arrays(file_path):
    """Merge runs of consecutive ``'table'`` records read from a text file.

    Each non-blank line of the file is evaluated as a Python literal and is
    expected to yield a dict with a ``'type'`` key. Consecutive dicts whose
    type is ``'table'`` are merged into the first dict of the run by extending
    its ``'data'`` list; any other record ends the current run. Blank lines are
    skipped, and lines that cannot be evaluated are reported and skipped.

    Args:
        file_path: Path of the text file, one record per line.

    Returns:
        list[dict]: One merged table dict per run, in file order.
    """
    merged_objects = []
    temp_array = {}

    # Open the file and process it line by line.
    with open(file_path, 'r') as file:
        for line in file:
            # Drop the trailing newline and surrounding whitespace.
            line = line.strip()
            if not line:
                continue

            # NOTE(security): eval() executes arbitrary code contained in the
            # file. Only use this on files produced by this project; consider
            # ast.literal_eval if the records are plain literals.
            try:
                obj = eval(line)
            except (SyntaxError, NameError, TypeError, ValueError) as e:
                # eval() never raises json.JSONDecodeError, so these are the
                # errors that a malformed line actually produces.
                print(f"Error decoding JSON line: {e}")
                continue

            if obj['type'] == 'table':
                if not temp_array.get('page_num'):
                    # First table of a run becomes the accumulator.
                    temp_array = obj
                else:
                    temp_array['data'].extend(obj['data'])
            elif temp_array:
                # A non-table record closes the current run.
                merged_objects.append(temp_array)
                temp_array = {}

    # Flush a run that reaches the end of the file.
    if temp_array:
        merged_objects.append(temp_array)

    return merged_objects


if __name__ == "__main__":
    # Manual smoke run against a local sample report.
    sample_pdf = '/Users/zhengfei/Desktop/0609/zhangjun-430489-2023-nb-nb_2824bc6302e9442285aed64eed760d99/zhangjun-430489-2023-nb-nb_2824bc6302e9442285aed64eed760d99_1.pdf'
    get_pdf_info(sample_pdf, 'all')

    # Other ad-hoc experiments, kept for reference:
    #
    # Classify a single line:
    #   print(get_text_type('6/223 '.strip()))
    #
    # Time the extraction:
    #   start = time.time()
    #   ... run get_pdf_info ...
    #   end = time.time()
    #   print('Task %s runs %0.2f seconds.' % ('223', (end - start)))
    #
    # Collapse a sample list of numbers into ranges and print the result:
    #   numbers = [1, 2, 3, 5, 7, 9, 10, 12, 13, 14, 17, 18, 19, 20, 22, 23, 24, 26, 27, 28, 29, 30, 32, 33, 34, 36, 37, 38, 39]
    #   print(find_continuous_numbers(numbers))
    #
    # Name the columns of two tables from their first row, then merge them:
    #   df1 = tables[0].df
    #   df2 = df1.rename(columns=df1.iloc[0]).drop(df1.index[0])   # use row 0 as the column labels
    #   df3 = tables[1].df
    #   df4 = df3.rename(columns=df3.iloc[0]).drop(df3.index[0])
    #   df__2 = df2.append(df4, ignore_index=True)   # align on column names and build a new index
    #   print(df__2)
    #
    # Merge consecutive table records from a dump file and print the result:
    #   print(merge_consecutive_arrays('/Users/zhengfei/work/zzb_data/tables.txt'))