# pdf_code/zzb_data/test.py

import camelot
import os
import tempfile
import matplotlib.pyplot as plt
import numpy as np
#from camelot.plotting import plot_contour

#单独针对三季报的资产负债表识别合并问题
import re

def process_array(arr, years=('2022', '2023', '2024'), keyword='项目'):
    """Split table rows that were extracted as one merged cell into columns.

    If the header row is a single cell that contains ``keyword`` and at least
    one entry of ``years`` (compared case-insensitively), it is split on
    whitespace into separate header cells.  Before splitting, the spaces
    around 年/月 and before 日 are removed so that a date such as
    "2023 年 9 月 30 日" stays in one cell.  Every following row that consists
    of a single cell is then split on whitespace too, and truncated or padded
    with '' so that it has exactly as many cells as the header row.

    Args:
        arr: The table, either a list of rows (each a list of strings) or a
            numpy array, which is converted with ``tolist()``.  A list
            argument is modified in place.
        years: Year strings; the header must contain at least one of them.
        keyword: Text the header must contain.

    Returns:
        The table as a list of rows.  An empty table is returned unchanged.
    """

    def is_valid_header(header):
        # Lower-case both sides: lower-casing only the header would make a
        # keyword containing upper-case letters impossible to match.
        header_text = header.lower()
        return (
            any(year.lower() in header_text for year in years)
            and keyword.lower() in header_text
        )

    def clean_text(text):
        # Remove the whitespace on both sides of "年" and "月".
        text = re.sub(r'\s*(年|月)\s*', r'\1', text)
        # Remove the whitespace to the left of "日".
        text = re.sub(r'\s*日', '日', text)
        return text

    # Work on plain lists so rows can grow.
    if isinstance(arr, np.ndarray):
        arr = arr.tolist()

    # An empty table has no header row to inspect; arr[0] would raise.
    if not arr:
        return arr

    header = arr[0]
    if len(header) == 1 and is_valid_header(header[0]):
        parts = clean_text(header[0]).split()
        if parts:
            # Slice assignment keeps the same row object (in-place update).
            header[:] = parts

    header_columns = len(header)

    for row in arr[1:]:
        if len(row) != 1:
            continue
        # Surplus tokens beyond the header width are dropped.
        parts = row[0].split()[:header_columns]
        if header_columns:
            # Pad with empty cells when the row has fewer tokens than the header.
            row[:] = parts + [''] * (header_columns - len(parts))

    return arr


def process_array_with_annual_comparison(arr, keywords=['本报告期', '年初至报告期末', '上年同期']):
    """Relabel the first half of the duplicated "上年同期" header cells.

    The header (first row) is only touched when every entry of ``keywords``
    appears in it as a whole cell.  In that case, if the cell "上年同期"
    occurs more than once, the first half of its occurrences (integer
    division) is replaced by "三季报中无需识别的上年同期"; the remaining
    occurrences keep their text.

    Args:
        arr: The table as a list of rows or a numpy array (converted with
            ``tolist()``).  A list argument is modified in place.
        keywords: Cells that must all be present in the header row.

    Returns:
        The table as a list of rows.
    """
    duplicated_label = '上年同期'
    placeholder = '三季报中无需识别的上年同期'

    if isinstance(arr, np.ndarray):
        arr = arr.tolist()

    # Nothing to do for an empty table or an empty header row.
    if len(arr) == 0 or len(arr[0]) == 0:
        return arr

    header = arr[0]
    for word in keywords:
        if word not in header:
            return arr

    # Positions of every cell that is exactly the duplicated label.
    positions = [pos for pos, cell in enumerate(header) if cell == duplicated_label]
    if len(positions) > 1:
        for pos in positions[:len(positions) // 2]:
            header[pos] = placeholder

    arr[0] = header
    return arr

def process_array_with_grants(arr, keywords=('本报告期', '年初至报告期'), target='计入当期损益的政府补助', replacement='非经常性损益'):
    """Rename the "合计" row label of a table that lists government grants.

    The table is changed only when both conditions hold:

    * every entry of ``keywords`` occurs as a substring of some header cell
      (first row), and
    * ``target`` occurs as a substring of the first cell of some row.

    In that case every row whose first cell is exactly "合计" gets that cell
    replaced by ``replacement``.  Rows without any cells are skipped instead
    of raising ``IndexError``.

    Args:
        arr: The table as a list of rows or a numpy array (converted with
            ``tolist()``).  A list argument is modified in place.
        keywords: Substrings that must all appear in the header row.
        target: Substring that must appear in the first column.
        replacement: New label for the "合计" rows.

    Returns:
        The table as a list of rows.
    """
    total_label = '合计'

    if isinstance(arr, np.ndarray):
        arr = arr.tolist()

    # Nothing to do for an empty table or an empty header row.
    if len(arr) == 0 or len(arr[0]) == 0:
        return arr

    header = arr[0]
    header_matches = all(
        any(keyword in str(cell) for cell in header) for keyword in keywords
    )
    if not header_matches:
        return arr

    # Rows with no cells have no first column; ignore them.
    has_target = any(target in str(row[0]) for row in arr if len(row) > 0)
    if has_target:
        for row in arr:
            if len(row) > 0 and row[0] == total_label:
                row[0] = replacement

    return arr
# Parent directory under which a per-run scratch directory is created.
temp_dir_path = "F:\\temp"

# Create the parent temp directory if it does not exist yet.
if not os.path.exists(temp_dir_path):
    os.makedirs(temp_dir_path)
# Earlier sample inputs, kept for reference:
#file_path = "F:\\11_pdf\\603636-2024-0630-0803.pdf"#"F:\11_pdf\603636-2024-0630-0803.PDF"
#file_path = r"C:\Users\钱程\Downloads\天德钰：深圳天德钰科技股份有限公司2024年半年度报告.PDF"
#file_path = r"C:\Users\钱程\Downloads\600239-2023-nb-nb.pdf"
# PDF that is parsed by this run.
file_path = r"C:\Users\钱程\Downloads\航发科技：中国航发航空科技股份有限公司2023年第三季度报告.PDF"
# Create a uniquely named scratch directory for this run.
temp_dir = tempfile.mkdtemp(prefix="camelot_temp_", dir=temp_dir_path)
# Point the process-wide TMP/TEMP environment variables at the scratch directory.
# NOTE(review): temp_dir is never removed and is not passed to the active
# read_pdf call below — confirm whether cleanup is expected.
os.environ["TMP"] = temp_dir
os.environ["TEMP"] = temp_dir
# try:
#     tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'])
#     print('读取成功')
# except Exception as e:
#     print(f'错误在{e}')
#print(f'file_path的值是{file_path}')
#file_path = "F:\\11_pdf\\688670-2023-nb-nb.pdf"
# Name of the Ghostscript executable; presumably read by the Ghostscript
# wrapper that camelot uses — TODO confirm.
os.environ["GHOSTSCRIPT_BINARY"] = "gswin64c"

# Fail early if the input PDF does not exist.
if not os.path.exists(file_path):
    print(f'文件路径不正确或文件不存在: {file_path}')
    raise FileNotFoundError(f"文件不存在：{file_path}")
else:
    pass  # The file exists; nothing else to do.

# Read the PDF. The commented-out calls below are earlier parameter
# combinations that were tried.
#tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n')#, copy_text=['h']
#tables = camelot.read_pdf(file_path, pages=pages, flavor='lattice', strip_text=' ,\n', temp_dir=temp_dir)
#tables = camelot.read_pdf(file_path,strip_text=' ,\n',pages = '1-1', copy_text=['h','v'], temp_dir=temp_dir,shift_text = [''])#line_scale=10,
#tables = camelot.read_pdf(file_path, pages='1-1', strip_text=' ,\n', copy_text=['h'], temp_dir=temp_dir)

#tables = camelot.read_pdf(file_path, pages='1-1', strip_text=' ,\n', copy_text=['h'], temp_dir=temp_dir)
#tables = camelot.read_pdf(file_path, pages='1-1', strip_text=' ,\n', copy_text=['v','h'], temp_dir=temp_dir,shift_text = [''])
#54
# Active call: page range '1-1'; commas and newlines are stripped from the
# cell text (spaces are kept, unlike the variants above).
tables = camelot.read_pdf(file_path, pages='1-1', strip_text=',\n', copy_text=['v','h'],shift_text = ['l'])
#camelot.plot(tables[0], kind='grid').savefig('contour_plot.png')
# for t in tables:

#     top = t._bbox[3]
#     buttom = t._bbox[1]
#     page_num = int(t.page)
#     table_index = int(t.order)
#     arr = np.array(t.data)
#print(arr)
# Post-process every extracted table and print it before and after.
for t in tables:
    # Position metadata of the table; read here but not used further below.
    top = t._bbox[3]
    buttom = t._bbox[1]
    page_num = int(t.page)
    table_index = int(t.order)

    arr = np.array(t.data)
    print('=======')
    print(arr)
    arr = process_array_with_annual_comparison(arr)
    arr = process_array_with_grants(arr)

    # A table without rows has no header; use an empty header so the check
    # below is simply skipped instead of raising IndexError on arr[0].
    header = arr[0] if len(arr) > 0 else []

    # Merged-header case: four header cells that all carry the same text,
    # and that text contains both "项目" and "附注".
    merged_header = (
        len(header) == 4
        and all(value == header[0] for value in header)
        and "项目" in header[0]
        and "附注" in header[0]
    )
    if merged_header:
        initial_value = header[0].replace(' ', '')
        project_value = "项目"
        note_value = "附注"
        # What is left after removing the first "项目" and the first "附注"
        # is cut in the middle into two column titles.
        remaining_value = initial_value.replace("项目", "", 1).replace("附注", "", 1)
        split_index = len(remaining_value) // 2
        first_half = remaining_value[:split_index]
        second_half = remaining_value[split_index:]
        # Both halves must occur verbatim in the merged text, otherwise
        # their positions cannot be looked up.
        if first_half in initial_value and second_half in initial_value:
            project_index = initial_value.index("项目")
            year_index = initial_value.index(first_half)
            year_index_2 = initial_value.index(second_half)
            print('条件满足')

            # Swap the halves when "项目" sits between them in the text.
            if project_index > year_index and project_index < year_index_2:
                first_half, second_half = second_half, first_half
        arr[0] = [project_value, note_value, first_half, second_half]
    print(arr)
# Export all extracted tables in CSV format with compress=True.
# Other formats: json, excel, html, markdown, sqlite.
tables.export('foo.csv', f='csv', compress=True)
# Index of the single table handled below.
a=0
# Bare expression: the value is discarded when run as a script.
tables[a].parsing_report
# Write table `a` to foo.csv. Other writers: to_json, to_excel, to_html,
# to_markdown, to_sqlite.
tables[a].to_csv('foo.csv')
# Bare expression: the pandas DataFrame is built but not used.
tables[a].df

import pandas as pd

# Path of the CSV file to convert (replace with your own CSV file path).
csv_file_path = 'foo.csv'
# Path of the Excel file to write.
excel_file_path = 'foodata.xlsx'

# Read the CSV file.
# NOTE(review): read_csv treats the first CSV row as column names by default;
# confirm this matches how the CSV above was written (duplicate or empty
# names would be renamed by pandas).
df = pd.read_csv(csv_file_path)

# Write the data to the Excel file without the index column.
df.to_excel(excel_file_path, index=False)

print(f"CSV 文件已成功转换为 Excel 文件：{excel_file_path}")