# Listing metadata: 220 lines, 8.7 KiB, Python
import os
import re
import tempfile

import camelot
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# from camelot.plotting import plot_contour

# The helpers below specifically target merged-cell recognition problems in
# the balance sheet of third-quarter reports.
def process_array(arr, years=('2022', '2023', '2024'), keyword='项目'):
    """Split a table whose rows were each extracted as one merged cell.

    The repair only runs when the first row has exactly one cell and that
    cell contains ``keyword`` plus at least one entry of ``years`` (both
    compared case-insensitively). The header cell is then cleaned of stray
    whitespace around "年" / "月" / "日" and split on whitespace into columns.
    Every following single-cell row is split the same way and truncated or
    padded with ``''`` to the header width. Rows that already have more than
    one cell are left untouched.

    Args:
        arr: A ``numpy.ndarray`` or a list of row lists. A list input is
            modified in place.
        years: Year strings of which at least one must occur in the header.
        keyword: Text that must occur in the header.

    Returns:
        The table as a list of row lists (the same list object when a list
        was passed in). Empty input is returned unchanged.
    """
    def clean_text(text):
        # Drop whitespace on both sides of "年" and "月" ...
        text = re.sub(r'\s*(年|月)\s*', r'\1', text)
        # ... and on the left side of "日", so a date stays one token.
        text = re.sub(r'\s*日', '日', text)
        return text

    def is_valid_header(header):
        header_text = header.lower()
        # Lowercase the needles as well; otherwise a keyword containing an
        # uppercase letter could never match the lowercased header.
        has_year = any(str(year).lower() in header_text for year in years)
        return has_year and keyword.lower() in header_text

    rows = arr.tolist() if isinstance(arr, np.ndarray) else arr

    # Nothing to repair for an empty table or a header that is not merged.
    if len(rows) == 0 or len(rows[0]) != 1 or not is_valid_header(rows[0][0]):
        return rows

    header_row = rows[0]
    header_parts = clean_text(header_row[0]).split()
    if header_parts:
        header_row[:] = header_parts  # in place, the row object is kept
    header_columns = len(header_row)

    for row in rows[1:]:
        if len(row) != 1:
            continue
        # Tokens beyond the header width are dropped; missing ones become ''.
        tokens = row[0].split()[:header_columns]
        row[:] = tokens + [''] * (header_columns - len(tokens))

    return rows
def process_array_with_annual_comparison(arr, keywords=('本报告期', '年初至报告期末', '上年同期')):
    """Rename the first half of duplicated "上年同期" header cells.

    The header (first row) is only touched when every entry of ``keywords``
    is present as an exact cell value. In that case, if "上年同期" occurs more
    than once, the first ``count // 2`` occurrences are replaced with
    "三季报中无需识别的上年同期"; the remaining occurrences keep their text.

    Args:
        arr: A ``numpy.ndarray`` or a list of row lists. A list input is
            modified in place.
        keywords: Cell values that must all be present in the header row.

    Returns:
        The table as a list of row lists. Empty tables and tables with an
        empty header row are returned unchanged.
    """
    rows = arr.tolist() if isinstance(arr, np.ndarray) else arr

    if len(rows) == 0 or len(rows[0]) == 0:
        return rows

    header = rows[0]
    # Membership on the row itself: a keyword must equal a whole cell.
    if not all(keyword in header for keyword in keywords):
        return rows

    positions = [idx for idx, cell in enumerate(header) if cell == '上年同期']
    if len(positions) > 1:
        for idx in positions[:len(positions) // 2]:
            header[idx] = '三季报中无需识别的上年同期'

    return rows
def process_array_with_grants(arr, keywords=('本报告期', '年初至报告期'), target='计入当期损益的政府补助', replacement='非经常性损益'):
    """Relabel the "合计" row of a table that lists ``target`` in column one.

    The rewrite happens only when both conditions hold:

    * every entry of ``keywords`` occurs as a substring of some header cell;
    * ``target`` occurs as a substring of some first-column cell.

    Then each row whose first cell is exactly "合计" gets that cell replaced
    with ``replacement``. Empty rows are skipped instead of raising
    ``IndexError``.

    Args:
        arr: A ``numpy.ndarray`` or a list of row lists. A list input is
            modified in place.
        keywords: Substrings that must all appear in the header row.
        target: Substring that must appear somewhere in the first column.
        replacement: New text for first-column cells equal to "合计".

    Returns:
        The table as a list of row lists. Empty tables and tables with an
        empty header row are returned unchanged.
    """
    rows = arr.tolist() if isinstance(arr, np.ndarray) else arr

    if len(rows) == 0 or len(rows[0]) == 0:
        return rows

    header = rows[0]
    header_matches = all(
        any(keyword in str(cell) for cell in header) for keyword in keywords
    )
    # Ragged input may contain empty rows; they have no first column.
    if header_matches and any(target in str(row[0]) for row in rows if len(row) > 0):
        for row in rows:
            if len(row) > 0 and row[0] == '合计':
                row[0] = replacement

    return rows
# Parent directory under which a per-run scratch directory is created.
temp_dir_path = "F:\\temp"

# Create the parent directory when missing; exist_ok avoids the race between
# a separate existence check and the creation call.
os.makedirs(temp_dir_path, exist_ok=True)

# PDF to extract tables from.
file_path = r"C:\Users\钱程\Downloads\航发科技:中国航发航空科技股份有限公司2023年第三季度报告.PDF"

# Fresh scratch directory for this run.
# NOTE(review): nothing in this section removes it again - confirm whether
# cleanup is handled elsewhere.
temp_dir = tempfile.mkdtemp(prefix="camelot_temp_", dir=temp_dir_path)

# Point the process-wide temp-directory environment variables at it,
# presumably so that libraries resolving their temp directory from the
# environment write there - TODO confirm.
os.environ["TMP"] = temp_dir
os.environ["TEMP"] = temp_dir

# Name of the Ghostscript executable; presumably consumed by the PDF
# rendering backend - TODO confirm.
os.environ["GHOSTSCRIPT_BINARY"] = "gswin64c"

# Fail early, with a readable message, when the input PDF is missing.
if not os.path.exists(file_path):
    print(f'文件路径不正确或文件不存在: {file_path}')
    raise FileNotFoundError(f"文件不存在:{file_path}")
# Read the tables of the first page of the PDF. The keyword options are
# Camelot parsing settings; see the Camelot API reference for their exact
# semantics.
tables = camelot.read_pdf(file_path, pages='1-1', strip_text=',\n', copy_text=['v', 'h'], shift_text=['l'])

# NOTE(review): the original indentation of this loop was lost; the nesting
# below is the most plausible reconstruction - confirm against the author's
# copy, in particular the placement of the final header assignment.
for t in tables:
    # Table geometry and position; not used further in this section.
    top = t._bbox[3]
    buttom = t._bbox[1]
    page_num = int(t.page)
    table_index = int(t.order)

    arr = np.array(t.data)
    print('=======')
    print(arr)

    # Both helpers return plain lists of rows.
    arr = process_array_with_annual_comparison(arr)
    arr = process_array_with_grants(arr)

    # Guard against tables without rows or with an empty header row.
    header = arr[0] if len(arr) > 0 else []
    first_cell = header[0] if len(header) > 0 else ''

    # A merged header shows up as four identical cells that each contain both
    # "项目" and "附注" together with the two period labels.
    merged_header = (
        len(header) == 4
        and all(cell == first_cell for cell in header)
        and "项目" in first_cell
        and "附注" in first_cell
    )

    if merged_header:
        initial_value = first_cell.replace(' ', '')
        project_value = "项目"
        note_value = "附注"

        # What remains after removing the two fixed labels is assumed to be
        # two period labels of equal length, so it is cut in the middle.
        remaining_value = initial_value.replace("项目", "", 1).replace("附注", "", 1)
        split_index = len(remaining_value) // 2
        first_half = remaining_value[:split_index]
        second_half = remaining_value[split_index:]

        # Only look up positions when both halves occur verbatim in the
        # merged text; str.index would raise ValueError otherwise.
        if first_half in initial_value and second_half in initial_value:
            project_index = initial_value.index("项目")
            year_index = initial_value.index(first_half)
            year_index_2 = initial_value.index(second_half)
            print('条件满足')

            # Swap the halves when "项目" sits between them in the merged text.
            if year_index < project_index < year_index_2:
                first_half, second_half = second_half, first_half

        arr[0] = [project_value, note_value, first_half, second_half]

    print(arr)
# Index of the table that is converted to Excel.
a = 0

# Intermediate CSV file and resulting Excel workbook.
csv_file_path = 'foo.csv'
excel_file_path = 'foodata.xlsx'

if len(tables) > a:
    # Export every detected table as CSV, bundled into an archive
    # (other formats: json, excel, html, markdown, sqlite).
    tables.export('foo.csv', f='csv', compress=True)

    # Write the selected table to the intermediate CSV file
    # (also available: to_json, to_excel, to_html, to_markdown, to_sqlite).
    tables[a].to_csv(csv_file_path)

    # Round-trip through pandas to turn the CSV into an Excel workbook.
    # NOTE(review): read_csv treats the first CSV line as column names -
    # confirm that is intended for the table's header row.
    df = pd.read_csv(csv_file_path)
    df.to_excel(excel_file_path, index=False)

    print(f"CSV 文件已成功转换为 Excel 文件:{excel_file_path}")
else:
    # No table at index `a`: report it instead of failing with IndexError.
    print('未识别到任何表格,跳过导出')