# Source: pdf_code/zzb_data_prod/test_0711_v2.py
# (web-scrape page chrome preserved as a comment: "328 lines / 16 KiB / Python /
#  Raw Normal View History"; commit timestamp 2024-11-29 15:58:06 +08:00)
import camelot
import re
#from multiprocessing import Pool
import os, time, random
import json
#from config import MILVUS_CLIENT,MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB,MEASURE_COUNT
from datetime import datetime
# 读取PDF
import PyPDF2
# 分析PDF的layout提取文本
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
import pdfplumber
import mysql.connector
#import utils
from pymilvus import MilvusClient
#import llm_service
#import db_service
#import pdf_title
import numpy as np
#from multiprocessing import Process
# 2025-09-02 15:23:55 +08:00  (commit timestamp from scrape; not code)
import logging
logger = logging.getLogger(__name__)
# 2024-11-29 15:58:06 +08:00  (commit timestamp from scrape; not code)
# Whitelist: regex alternation of financial-metric keywords (revenue, net
# profit, cash flow, EPS, ...). A table is kept only if at least one of these
# appears in its flattened text.
STR_PATTERN = '营业收入|净利润|变动比例|损益|现金流量净额|现金净流量|现金流|每股收益|总资产|资产总额|收益率|货币资金|应收账款|存货|固定资产|在建工程|商誉|短期借款|应付账款|合同负债|长期借款|营业成本|销售费用|管理费用|财务费用|研发费用|研发投入'
# Blacklist: if any of these phrases appears anywhere in a table's text, the
# whole table is discarded (segment/subsidiary/channel breakdowns etc.).
PATTERN = '品牌类型|分门店|销售渠道|行业名称|产品名称|地区名称|子公司名称|业绩快报|调整情况说明|调整年初资产负债表|计入当期损益的政府补助|主要子公司|分部|母公司资产负债表|显示服务|渠道|商品类型|合同分类|会计政策变更|地区分类'
#unit_pattern = re.compile(r'单位[|:]?(百万元|千万元|亿元|万元|千元|元)')
# "Pre-adjustment" marker: tables with 5 or more occurrences are dropped.
MUILT_PATTERN = '调整前'
file_path = r"combined_v61.pdf"  # PDF parsed when this module runs as a script
file_id = 1                      # file identifier (used by the disabled DB bookkeeping)
pages = '1-2'                    # camelot page-range string
tables_range = {}                # page_num -> list of detected table bounding boxes
# def get_table_range_test(file_path, file_id, pages, tables_range):
# print('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
# #(f'file_path: {file_path},file_id:{file_id},pages:{pages},tables_range:{tables_range}')
# start = time.time()
# import tempfile
# temp_dir_path = "F:\\temp"
# # 检查并创建临时文件夹
# if not os.path.exists(temp_dir_path):
# os.makedirs(temp_dir_path)
# # 创建临时文件夹
# temp_dir = tempfile.mkdtemp(prefix="camelot_temp_", dir=temp_dir_path)
# # 设置全局临时文件夹路径
# os.environ["TMP"] = temp_dir
# os.environ["TEMP"] = temp_dir
# # conn = mysql.connector.connect(
# # host= MYSQL_HOST,
# # user= MYSQL_USER,
# # password= MYSQL_PASSWORD,
# # database= MYSQL_DB
# # )
# # 创建一个cursor对象来执行SQL语句
# #print(f'file_path的值是{file_path}')
# #cursor = conn.cursor()
# # try:
# # tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'])
# # print('读取成功')
# # except Exception as e:
# # print(f'错误在{e}')
# #print(f'file_path的值是{file_path}')
# #file_path = "F:\\11_pdf\\688670-2023-nb-nb.pdf"
# os.environ["GHOSTSCRIPT_BINARY"] = "gswin64c"
# try:
# # 确保 file_path 是正确的,并且文件是可访问的
# if not os.path.exists(file_path):
# print(f'文件路径不正确或文件不存在: {file_path}')
# raise FileNotFoundError(f"文件不存在:{file_path}")
# else:
# pass#(f'file_path是存在的就是{file_path}')
# # 读取 PDF 文件
# #tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n')#, copy_text=['h']
# #tables = camelot.read_pdf(file_path, pages=pages, flavor='lattice', strip_text=' ,\n', temp_dir=temp_dir)
# tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'], temp_dir=temp_dir)#line_scale=10,
# print('读取成功')
# print("检测到的表格数量:", tables.n)
# except FileNotFoundError as fe:
# print(fe)
# except Exception as e:
# print(f'处理PDF时出错: {e}')
# for t in tables:
# top = t._bbox[3]
# buttom = t._bbox[1]
# page_num = int(t.page)
# table_index = int(t.order)
# arr = np.array(t.data)
# #recent_value = None
# #这里开始对可能解析错误的值做判断:
# for i, row in enumerate(arr):
# if len(row) >= 4:
# # first_value = row[0]
# # if ("2023年度" in first_value or "2022年度" in first_value) and len(first_value) <= 12:
# # recent_value = first_value
# # if first_value == '' and recent_value:
# # row[0] = recent_value
# # 检查条件:第一列不为数字,第二列和第四列为空,第三列有三个小数点【三列的数字被识别到一起了】
# if (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 4 and len(row[2].rsplit('.', 1)[-1]) == 2) and (row[3] == ''):
# split_values = row[2].split('.')
# # 确保可以正确拆分成三个数值
# if len(split_values) == 4:
# new_value1 = f"{split_values[0]}.{split_values[1][:2]}"
# new_value2 = f"{split_values[1][2:]}.{split_values[2][:2]}"
# new_value3 = f"{split_values[2][2:]}.{split_values[3]}"
# row[1] = new_value1
# row[2] = new_value2
# row[3] = new_value3
# #检查条件:第一列不为数字,第二列第四列为空,第三列两个小数点,第五列两个小数点【两列的数字被识别到一起了】
# if len(row) >= 5 and (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 3) and (row[3] == '') and (len(row[4].split('.')) == 3) and len(row[2].rsplit('.', 1)[-1]) == 2 and len(row[4].rsplit('.', 1)[-1]) == 2:
# split_value_3 = row[2].split('.')
# split_value_5 = row[4].split('.')
# if len(split_value_3) == 3:
# new_value2 = f"{split_value_3[0]}.{split_value_3[1][:2]}"
# new_value3 = f"{split_value_3[1][2:]}.{split_value_3[2]}"
# if len(split_value_5) == 3:
# new_value4 = f"{split_value_5[0]}.{split_value_5[1][:2]}"
# new_value5 = f"{split_value_5[1][2:]}.{split_value_5[2]}"
# row[1] = new_value2
# row[2] = new_value3
# row[3] = new_value4
# row[4] = new_value5
# #检查条件:第一列不为数字,第二列为空,第三列有两个小数点,第四列为正常数字【两列的数字被识别到一起了】
# if len(row) >= 4 and (not row[0].replace('.', '', 1).isdigit()) and (row[1] == '') and (len(row[2].split('.')) == 3) and len(row[2].rsplit('.', 1)[-1]) == 2 and (row[3].replace('-', '', 1).replace('.', '', 1).isdigit()):
# split_values = row[2].split('.')
# if len(split_values) == 3:
# new_value2 = f"{split_values[0]}.{split_values[1][:2]}"
# new_value3 = f"{split_values[1][2:]}.{split_values[2]}"
# row[1] = new_value2
# row[2] = new_value3
# #检查条件:第一列不位数字,后面有一列中的值存在“%”并且"%"不是结尾,就进行拆分
# if not row[0].replace('.', '', 1).isdigit():
# for i in range(1, len(row) - 1):
# if row[i] == '' and '%' in row[i + 1] and len(row[i + 1].split('%')) == 2:
# split_values = row[i + 1].split('%')
# new_value1 = f"{split_values[0]}%"
# new_value2 = f"{split_values[1]}"
# row[i] = new_value1
# row[i + 1] = new_value2
# break
# #检查条件当一个列表中同时出现了2022年12月31日和2023年1月1日时【并且都只出现1次】在2022年12月31日后面增加“调整前”字段
# # if sum(1 for item in row if item.strip() == "2023年1月1日") == 1 and sum(1 for item in row if item.strip() == "2022年12月31日") == 1:
# # for i, item in enumerate(row):
# # stripped_item = item.strip() #去空格
# # if stripped_item == "2022年12月31日":
# # row[i] = stripped_item + '调整前'
# new_data = arr.tolist()#用于后面保存到数据库中
# rows, cols = arr.shape
# if rows == 1 and cols == 1:
# continue
# arr_str = ''.join([''.join(map(str, row)) for row in arr])
# #print(f'arr_str的值是 {arr_str}')
# #过滤掉不包含需抽取指标表格的文本
# matches = re.findall(STR_PATTERN, arr_str)
# pattern = re.findall(PATTERN,arr_str)
# muilt_pattern = re.findall(MUILT_PATTERN,arr_str)
# if len(matches) > 0 and len(pattern) == 0 and len(muilt_pattern)<5:
# if not tables_range.get(page_num):
# tables_range[page_num] = []
# tables_range[page_num].append({
# 'top' : top,
# 'buttom' : buttom,
# 'table_index' : table_index,
# 'page_num' : page_num,
# })
# print(f"tables_range的值是{tables_range}")
# #(f'file_id是{file_id}')
# # db_service.insert_pdf_parse_process({
# # 'file_id': file_id,
# # 'page_num' : page_num,
# # 'page_count' : 100,
# # 'type' : 'parse_table',
# # 'content':{
# # 'top' : top,
# # 'buttom' : buttom,
# # 'page_num' : page_num,
# # 'table_index' : table_index,
# # "type" : "table",
# # "data" : new_data,
# # 'sort_num' : page_num*1000 - top
# # }},conn,cursor)
# #get_text_content(file_path, file_id, tables_range, pages, conn, cursor)
# # cursor.close()
# # conn.close()
# end = time.time()
# print('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
def _repair_row(row):
    """Repair one table row (mutated in place) whose numeric cells were
    fused together by the PDF extractor.

    Handles, in order:
      1. three decimals fused into column 2 (columns 1 and 3 empty);
      2. two fused pairs, one in column 2 and one in column 4 (>= 5 columns);
      3. one fused pair in column 2 (column 3 already a normal number);
      4. a percentage glued to the following cell's value.

    Every case fires only when column 0 is a non-numeric label. Caller
    guarantees len(row) >= 4.
    """
    first_is_label = not row[0].replace('.', '', 1).isdigit()

    # Case 1: column 2 looks like "a.bbc.dde.ff" — three dots with a
    # two-digit final fragment — while columns 1 and 3 are empty.
    if (first_is_label and row[1] == '' and row[3] == ''
            and len(row[2].split('.')) == 4
            and len(row[2].rsplit('.', 1)[-1]) == 2):
        parts = row[2].split('.')
        row[1] = f"{parts[0]}.{parts[1][:2]}"
        row[2] = f"{parts[1][2:]}.{parts[2][:2]}"
        row[3] = f"{parts[2][2:]}.{parts[3]}"

    # Case 2: columns 2 and 4 each hold two decimals run together
    # (two dots, two-digit tails) and columns 1 and 3 are empty.
    if (len(row) >= 5 and first_is_label and row[1] == '' and row[3] == ''
            and len(row[2].split('.')) == 3
            and len(row[4].split('.')) == 3
            and len(row[2].rsplit('.', 1)[-1]) == 2
            and len(row[4].rsplit('.', 1)[-1]) == 2):
        p2 = row[2].split('.')
        p4 = row[4].split('.')
        row[1] = f"{p2[0]}.{p2[1][:2]}"
        row[2] = f"{p2[1][2:]}.{p2[2]}"
        row[3] = f"{p4[0]}.{p4[1][:2]}"
        row[4] = f"{p4[1][2:]}.{p4[2]}"

    # Case 3: column 2 holds two decimals run together and column 3 already
    # contains a normal (possibly negative) number.
    if (first_is_label and row[1] == ''
            and len(row[2].split('.')) == 3
            and len(row[2].rsplit('.', 1)[-1]) == 2
            and row[3].replace('-', '', 1).replace('.', '', 1).isdigit()):
        parts = row[2].split('.')
        row[1] = f"{parts[0]}.{parts[1][:2]}"
        row[2] = f"{parts[1][2:]}.{parts[2]}"

    # Case 4: a '%' value fused with the next cell (e.g. "12.3%45.6"):
    # split at the '%' and move the head into the empty preceding cell.
    # Only the first such pair is fixed (break), matching prior behavior.
    if first_is_label:
        for j in range(1, len(row) - 1):
            if row[j] == '' and '%' in row[j + 1] and len(row[j + 1].split('%')) == 2:
                head, tail = row[j + 1].split('%')
                row[j] = f"{head}%"
                row[j + 1] = tail
                break


def get_table_range_test(file_path, file_id, pages, tables_range):
    """Locate candidate financial tables on the given PDF pages.

    Reads ``pages`` of ``file_path`` with camelot, repairs rows whose numeric
    cells were fused during extraction, then keeps only tables whose flattened
    text contains at least one wanted metric (STR_PATTERN), none of the
    blacklist phrases (PATTERN), and fewer than five '调整前' markers
    (MUILT_PATTERN). Matching tables' bounding boxes are appended to
    ``tables_range`` in place, keyed by page number.

    Args:
        file_path: path of the PDF file to read.
        file_id: file identifier; currently unused here, kept for the
            disabled DB bookkeeping and for interface compatibility.
        pages: camelot page-range string, e.g. '1-2'.
        tables_range: dict of page_num -> list of
            {'top', 'buttom', 'table_index', 'page_num'} entries; mutated.
    """
    logger.info('Run task %s (%s)...' % (f'解析表格{pages}', os.getpid()))
    start = time.time()
    tables = camelot.read_pdf(file_path, pages=pages, strip_text=' ,\n', copy_text=['h'])
    for t in tables:
        top = t._bbox[3]
        buttom = t._bbox[1]  # (sic — key name kept for compatibility) bbox bottom
        page_num = int(t.page)
        table_index = int(t.order)
        # dtype=object keeps cells as plain Python strings so the repairs in
        # _repair_row cannot be truncated by a fixed-width unicode dtype.
        arr = np.array(t.data, dtype=object)
        for row in arr:
            if len(row) >= 4:
                _repair_row(row)
        new_data = arr.tolist()  # retained for the disabled DB insert below
        rows, cols = arr.shape
        if rows == 1 and cols == 1:
            # Single-cell "tables" are extraction noise; skip them.
            continue
        arr_str = ''.join(''.join(map(str, row)) for row in arr)
        # Filter out tables that do not contain the metrics to extract.
        matches = re.findall(STR_PATTERN, arr_str)
        pattern = re.findall(PATTERN, arr_str)
        muilt_pattern = re.findall(MUILT_PATTERN, arr_str)
        if len(matches) > 0 and len(pattern) == 0 and len(muilt_pattern) < 5:
            if not tables_range.get(page_num):
                tables_range[page_num] = []
            tables_range[page_num].append({
                'top': top,
                'buttom': buttom,
                'table_index': table_index,
                'page_num': page_num,
            })
            logger.debug(f"tables_range的值是{tables_range}")
            # db_service.insert_pdf_parse_process({...}, conn, cursor) — DB
            # persistence of new_data is disabled in this test script.
    end = time.time()
    logger.info('Task %s runs %0.2f seconds.' % (f'解析表格{pages}', (end - start)))
2024-11-29 15:58:06 +08:00
# Script entry: parse the configured PDF pages and collect table positions
# into the module-level tables_range dict.
get_table_range_test(file_path, file_id, pages, tables_range)