"""Extract candidate financial-indicator tables from a PDF annual report.

Uses camelot to detect tables page-by-page, repairs common cell-merge
parsing artifacts (two or three numeric columns glued into one cell),
filters tables by keyword patterns, and records the bounding box of every
table that looks like it contains extractable financial indicators into
``tables_range`` (keyed by page number).
"""
import camelot
import re
import os
import time
import random
import json
from datetime import datetime

# PDF reading / layout analysis (used by sibling extraction code in this project)
import PyPDF2
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
import pdfplumber

import mysql.connector
from pymilvus import MilvusClient
import numpy as np
import logging

# NOTE(review): config / db_service / llm_service / redis wiring is disabled
# in this standalone test variant; persistence code below is commented out.

logger = logging.getLogger(__name__)

# A table is kept only if it mentions at least one of these indicator names.
STR_PATTERN = '营业收入|净利润|变动比例|损益|现金流量净额|现金净流量|现金流|每股收益|总资产|资产总额|收益率|货币资金|应收账款|存货|固定资产|在建工程|商誉|短期借款|应付账款|合同负债|长期借款|营业成本|销售费用|管理费用|财务费用|研发费用|研发投入'
# If any of these terms appears anywhere in a table, the whole table is discarded.
PATTERN = '品牌类型|分门店|销售渠道|行业名称|产品名称|地区名称|子公司名称|业绩快报|调整情况说明|调整年初资产负债表|计入当期损益的政府补助|主要子公司|分部|母公司资产负债表|显示服务|渠道|商品类型|合同分类|会计政策变更|地区分类'
# '调整前' ("before adjustment") may appear a few times legitimately;
# five or more occurrences mark a restated-figures table we must skip.
MUILT_PATTERN = '调整前'

# Ad-hoc manual-test parameters (kept at module level for import compatibility).
file_path = r"combined_v61.pdf"
file_id = 1
pages = '1-2'
tables_range = {}


def _repair_row(row):
    """Repair camelot cell-merge artifacts in one table row, in place.

    camelot sometimes glues the numbers of 2-3 adjacent columns into a
    single cell (e.g. ``'1234.56789.01'`` is really ``'1234.56'`` plus
    ``'789.01'``).  Each rule below detects one such pattern from the dot
    count of the merged cell and the emptiness of its neighbours, then
    redistributes the pieces.  Assumes fractional parts are exactly two
    digits wide.  ``row`` is a numpy string-array row with ``len(row) >= 4``.
    """
    # Every rule only applies to data rows whose first column is a text label.
    label_is_text = not row[0].replace('.', '', 1).isdigit()

    # Rule 1: THREE values fused into column 2 (columns 1 and 3 empty,
    # merged cell has three dots and a two-digit final fraction).
    if (label_is_text and row[1] == '' and row[3] == ''
            and len(row[2].split('.')) == 4
            and len(row[2].rsplit('.', 1)[-1]) == 2):
        p = row[2].split('.')
        row[1] = f"{p[0]}.{p[1][:2]}"
        row[2] = f"{p[1][2:]}.{p[2][:2]}"
        row[3] = f"{p[2][2:]}.{p[3]}"

    # Rule 2: TWO values fused in column 2 AND two in column 4
    # (columns 1 and 3 empty, each merged cell has two dots).
    if (len(row) >= 5 and label_is_text and row[1] == '' and row[3] == ''
            and len(row[2].split('.')) == 3
            and len(row[4].split('.')) == 3
            and len(row[2].rsplit('.', 1)[-1]) == 2
            and len(row[4].rsplit('.', 1)[-1]) == 2):
        a = row[2].split('.')
        b = row[4].split('.')
        row[1] = f"{a[0]}.{a[1][:2]}"
        row[2] = f"{a[1][2:]}.{a[2]}"
        row[3] = f"{b[0]}.{b[1][:2]}"
        row[4] = f"{b[1][2:]}.{b[2]}"

    # Rule 3: TWO values fused in column 2 while column 3 already holds a
    # plain (possibly negative) number.
    if (label_is_text and row[1] == ''
            and len(row[2].split('.')) == 3
            and len(row[2].rsplit('.', 1)[-1]) == 2
            and row[3].replace('-', '', 1).replace('.', '', 1).isdigit()):
        p = row[2].split('.')
        row[1] = f"{p[0]}.{p[1][:2]}"
        row[2] = f"{p[1][2:]}.{p[2]}"

    # Rule 4: a percentage fused with the next column's value — split at the
    # single '%' and backfill the preceding empty cell.
    # NOTE(review): the original intent ("%" must not be the last character)
    # is not enforced; a trailing '%' still triggers the split and leaves the
    # next cell empty — confirm whether that edge case matters downstream.
    if label_is_text:
        for j in range(1, len(row) - 1):
            if row[j] == '' and '%' in row[j + 1] and len(row[j + 1].split('%')) == 2:
                pct, rest = row[j + 1].split('%')
                row[j] = f"{pct}%"
                row[j + 1] = rest
                break


def get_table_range_test(file_path, file_id, pages, tables_range):
    """Scan ``pages`` of ``file_path`` for financial-indicator tables.

    Detects tables with camelot, repairs merged cells, and appends the
    bounding box of every qualifying table to ``tables_range`` (mutated in
    place): ``{page_num: [{'top', 'buttom', 'table_index', 'page_num'}, ...]}``.

    Args:
        file_path: path of the PDF to parse.
        file_id: file identifier (kept for the disabled DB-insert code below).
        pages: camelot page specification, e.g. ``'1-2'`` or ``'all'``.
        tables_range: dict filled in place; pass ``{}`` for a fresh scan.
    """
    logger.info('Run task %s (%s)...', f'解析表格{pages}', os.getpid())
    start = time.time()

    # copy_text=['h'] propagates text horizontally across spanning cells;
    # strip_text removes thousands separators and in-cell line breaks.
    tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'])
    for t in tables:
        top = t._bbox[3]      # y-coordinate of the table's top edge
        buttom = t._bbox[1]   # [sic] bottom edge; key spelling kept for downstream compat
        page_num = int(t.page)
        table_index = int(t.order)
        arr = np.array(t.data)

        # Repair rows where camelot merged several numeric columns into one cell.
        for row in arr:
            if len(row) >= 4:
                _repair_row(row)

        rows, cols = arr.shape
        if rows == 1 and cols == 1:
            continue  # single-cell "table" carries no indicator data

        # Flatten the table to one string for the keyword filters.
        arr_str = ''.join(''.join(map(str, row)) for row in arr)
        matches = re.findall(STR_PATTERN, arr_str)
        blacklist_hits = re.findall(PATTERN, arr_str)
        adjust_hits = re.findall(MUILT_PATTERN, arr_str)
        # Keep: mentions a wanted indicator, hits no blacklist term, and
        # mentions '调整前' fewer than five times.
        if matches and not blacklist_hits and len(adjust_hits) < 5:
            tables_range.setdefault(page_num, []).append({
                'top': top,
                'buttom': buttom,
                'table_index': table_index,
                'page_num': page_num,
            })
            logger.debug("tables_range的值是%s", tables_range)
            # Disabled persistence hook (schema kept for reference):
            # db_service.insert_pdf_parse_process({
            #     'file_id': file_id,
            #     'page_num': page_num,
            #     'page_count': 100,
            #     'type': 'parse_table',
            #     'content': {
            #         'top': top, 'buttom': buttom, 'page_num': page_num,
            #         'table_index': table_index, 'type': 'table',
            #         'data': arr.tolist(),
            #         'sort_num': page_num * 1000 - top,
            #     }}, conn, cursor)

    end = time.time()
    logger.info('Task %s runs %0.2f seconds.', f'解析表格{pages}', end - start)


if __name__ == "__main__":
    # Manual smoke test: parse the first two pages of the sample report.
    get_table_range_test(file_path, file_id, pages, tables_range)