2024-11-29 15:58:06 +08:00
|
|
|
|
#import camelot
|
|
|
|
|
import re
|
|
|
|
|
#from multiprocessing import Pool
|
|
|
|
|
import os, time, random
|
|
|
|
|
import json
|
|
|
|
|
#from config import MILVUS_CLIENT,MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB,MEASURE_COUNT
|
|
|
|
|
from datetime import datetime
|
|
|
|
|
# 读取PDF
|
|
|
|
|
import PyPDF2
|
|
|
|
|
# 分析PDF的layout,提取文本
|
|
|
|
|
from pdfminer.high_level import extract_pages
|
|
|
|
|
from pdfminer.layout import LTTextBoxHorizontal
|
|
|
|
|
import pdfplumber
|
|
|
|
|
import mysql.connector
|
|
|
|
|
#import utils
|
|
|
|
|
from pymilvus import MilvusClient
|
|
|
|
|
#import llm_service
|
|
|
|
|
#import db_service
|
|
|
|
|
#import pdf_title
|
|
|
|
|
import numpy as np
|
|
|
|
|
#from multiprocessing import Process
|
2025-09-02 15:23:55 +08:00
|
|
|
|
import logging
|
|
|
|
|
logger = logging.getLogger(__name__)
|
2024-11-29 15:58:06 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def text_in_table(top, tables_range, page_num):
    """Return True if ``top`` lies strictly inside a table's vertical span.

    Args:
        top: Vertical coordinate to test.
        tables_range: Mapping of page number to a list of dicts, each
            carrying a ``'top'`` and a ``'buttom'`` (sic) coordinate.
        page_num: Key of the page whose tables are checked.

    Returns:
        True when ``buttom < top < table top`` holds for at least one table
        registered for ``page_num``; False otherwise, including when the
        page has no entry (or an empty entry) in ``tables_range``.
    """
    page_tables = tables_range.get(page_num)
    if not page_tables:
        return False
    # Both bounds are exclusive: a coordinate sitting exactly on a table
    # edge is not considered to be inside the table.
    return any(
        table['buttom'] < top < table['top']
        for table in page_tables
    )
|
|
|
|
|
|
|
|
|
|
# Text consisting solely of a number, optionally followed by "/<number>"
# (for example "12" or "12/345").
_PAGE_NUMBER_PATTERN = re.compile(r'^\d+(/\d+)?$')

# Marker whose presence classifies a text box as a page header.
_PAGE_HEADER_MARKER = '年度报告'


def get_text_type(text: str):
    """Classify a text snippet as page header, page footer or body text.

    All whitespace is removed from ``text`` before classification, so the
    checks are insensitive to spaces and line breaks inside the snippet.

    Args:
        text: Raw text of a layout element.

    Returns:
        ``'page_header'`` if the whitespace-free text contains '年度报告';
        otherwise ``'page_footer'`` if it is only a page number such as
        ``"12"`` or ``"12/345"``; otherwise ``'text'``.
    """
    compact = re.sub(r"\s", "", text)

    # The header check takes precedence over the footer check.
    if _PAGE_HEADER_MARKER in compact:
        return 'page_header'

    if _PAGE_NUMBER_PATTERN.match(compact):
        return 'page_footer'

    return 'text'
|
|
|
|
|
def get_text_content_test(file_path, file_id, pages, tables_range):
    """Scan a page range of a PDF for keyword text located just above tables.

    For every horizontal text box on the selected pages, the box's top edge
    is compared with each table registered for that page. When the box sits
    between 5 and 150 units above a table's top edge and is not inside any
    table on the page, its whitespace-free text is classified with
    ``get_text_type``. A page header stops the table scan for that box;
    otherwise a match of '母公司|现金流量表补充' is logged at INFO level.

    Args:
        file_path: Path of the PDF file to read.
        file_id: Identifier of the file; not used by this function.
        pages: Inclusive 1-based page range formatted as ``"start-end"``.
        tables_range: Mapping of 1-based page number to a list of dicts with
            ``'top'`` and ``'buttom'`` (sic) coordinates of each table.

    Returns:
        None. Results are only reported through the module logger.

    Raises:
        IndexError: If ``pages`` does not contain a ``'-'`` separator.
        ValueError: If either page bound is not an integer.
    """
    bounds = pages.split('-')
    first_raw, last_raw = bounds[0], bounds[1]
    # Parse the bounds once up front so that a malformed range fails
    # immediately instead of being reported again for every page.
    page_start = int(first_raw)
    page_end = int(last_raw)

    # Read from the ``file_path`` argument rather than a module-level global.
    for pagenum, page in enumerate(extract_pages(file_path)):
        page_no = pagenum + 1
        if page_no > page_end:
            # Pages are produced in order; nothing further can be in range.
            break
        if page_no < page_start:
            continue

        try:
            page_tables = tables_range.get(page_no)
            if not page_tables:
                # Without tables on this page there is nothing to compare.
                continue

            for element in page:
                # Only horizontal text boxes are of interest.
                if not isinstance(element, LTTextBoxHorizontal):
                    continue

                line_text = re.sub(r"\s", "", element.get_text())
                element_top = element.bbox[3]

                for table in page_tables:
                    gap = element_top - table['top']
                    # A gap greater than 5 already implies the box is above
                    # this table, so no separate "inside this table" check
                    # is needed here.
                    if not 5 < gap < 150:
                        continue
                    if text_in_table(element_top, tables_range, page_no):
                        continue

                    if get_text_type(line_text) == 'page_header':
                        break

                    # NOTE(review): nesting reconstructed from statement
                    # order because the captured source lost its
                    # indentation - confirm this check belongs inside the
                    # "just above a table" branch.
                    if re.search('母公司|现金流量表补充', line_text):
                        logger.info('成功识别到了')
        except Exception as exc:
            # Per-page best effort: log the failure and move to the next page.
            logger.exception("Error processing page %s: %s", page_no, exc)
|
2024-11-29 15:58:06 +08:00
|
|
|
|
|
|
|
|
|
# Ad-hoc driver: runs the scan on a sample PDF with hand-written table
# coordinates for pages 1 and 2.
pdf_path = r"combined_v61.pdf"
file_id = 1

# Page number -> tables on that page, each with its vertical extent.
tables_range = {
    1: [
        {
            'top': 727.0118072976055,
            'buttom': 77.52552451539339,
            'table_index': 1,
            'page_num': 1,
        },
    ],
    2: [
        {
            'top': 687.408985176739,
            'buttom': 77.04549030786774,
            'table_index': 1,
            'page_num': 2,
        },
    ],
}

# Inclusive page range in "start-end" form.
pages = '1-2'

get_text_content_test(
    file_path=pdf_path,
    file_id=file_id,
    pages=pages,
    tables_range=tables_range,
)
|