"""Scan a page range of a PDF for text boxes located just above known tables.

Ad-hoc test script: when run directly it scans ``combined_v61.pdf`` using the
sample table regions defined at the bottom of the file.
"""

# import camelot
import json
import logging
import os
import random
import re
import time
# from multiprocessing import Pool
# from multiprocessing import Process
from datetime import datetime

import mysql.connector
import numpy as np
import pdfplumber
# Read the PDF
import PyPDF2
# Analyze the PDF layout and extract text
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBoxHorizontal
from pymilvus import MilvusClient

# from config import MILVUS_CLIENT,MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB,MEASURE_COUNT
# import utils
# import llm_service
# import db_service
# import pdf_title

logger = logging.getLogger(__name__)

# Substring that classifies a line as a running page header.
_PAGE_HEADER_MARKER = '年度报告'
# A bare page number such as "12" or "12/345".
_PAGE_NUMBER_PATTERN = re.compile(r'^\d+(/\d+)?$')
# Keywords whose presence in a line above a table is reported in the log.
_KEYWORD_PATTERN = re.compile('母公司|现金流量表补充')
# A text box is only considered when its top edge lies strictly between these
# two distances above the table's top edge.
# NOTE(review): units are presumably PDF points -- confirm against the code
# that produces ``tables_range``.
_MIN_GAP_ABOVE_TABLE = 5
_MAX_GAP_ABOVE_TABLE = 150


def text_in_table(top, tables_range, page_num):
    """Return True if ``top`` lies strictly inside any table on ``page_num``.

    Args:
        top: Vertical coordinate to test.
        tables_range: Mapping of page number to a list of dicts, each holding
            a ``'top'`` and a ``'buttom'`` coordinate (the key is spelled that
            way in the data).
        page_num: Page whose tables are checked.

    Returns:
        True when ``buttom < top < top-of-table`` holds for at least one table
        on the page; False otherwise, including when the page has no entry.
    """
    tables = tables_range.get(page_num) or ()
    return any(
        top < table['top'] and top > table['buttom'] for table in tables
    )


def get_text_type(text: str):
    """Classify a line of text as page header, page footer or ordinary text.

    All whitespace is removed before matching.

    Args:
        text: Raw text of one line or text box.

    Returns:
        ``'page_header'`` if the text contains the header marker,
        ``'page_footer'`` if it is only a page number (``N`` or ``N/M``),
        otherwise ``'text'``.
    """
    compact = re.sub(r"\s", "", text)

    if _PAGE_HEADER_MARKER in compact:
        return 'page_header'

    if _PAGE_NUMBER_PATTERN.match(compact):
        return 'page_footer'

    return 'text'


def get_text_content_test(file_path, file_id, pages, tables_range):
    """Log a message for keyword text found just above tables in a page range.

    For every horizontal text box on the selected pages, each table on that
    page is checked in turn. When the box's top edge is more than 5 and less
    than 150 units above the table's top edge and is not inside any table on
    the page, the text is classified; a page header stops the check for that
    box, otherwise a log line is emitted if the text contains one of the
    keywords.

    Args:
        file_path: Path of the PDF to scan.
        file_id: Unused; kept so existing callers keep working.
        pages: Inclusive 1-based page range formatted as ``'start-end'``.
        tables_range: Mapping of 1-based page number to a list of table dicts
            with ``'top'`` and ``'buttom'`` coordinates.

    Raises:
        IndexError: If ``pages`` contains no ``'-'``.
        ValueError: If either bound of ``pages`` is not an integer.
    """
    # Parse the bounds once up front so malformed input fails immediately
    # instead of being reported again for every page.
    bounds = pages.split('-')
    page_start = int(bounds[0])
    page_end = int(bounds[1])

    # Read from the caller-supplied path (not a module-level global).
    for page_num, page in enumerate(extract_pages(file_path), start=1):
        if page_num < page_start:
            continue
        if page_num > page_end:
            # Pages are yielded in order, so nothing further can be in range.
            break

        # Best effort per page: a failure is logged and the scan moves on.
        try:
            page_tables = tables_range.get(page_num)
            if not page_tables:
                # Without tables on this page no text box can qualify.
                continue

            for element in page:
                if not isinstance(element, LTTextBoxHorizontal):
                    continue

                line_text = re.sub(r"\s", "", element.get_text())
                element_top = element.bbox[3]

                for table in page_tables:
                    # A positive gap means the box starts above this table,
                    # which also rules out the box being inside it.
                    gap = element_top - table['top']
                    if not _MIN_GAP_ABOVE_TABLE < gap < _MAX_GAP_ABOVE_TABLE:
                        continue
                    if text_in_table(element_top, tables_range, page_num):
                        continue

                    if get_text_type(line_text) == 'page_header':
                        break
                    # Record the pages that need to be filtered out
                    if _KEYWORD_PATTERN.search(line_text):
                        logger.info('成功识别到了')
        except Exception as e:
            logger.error("Error processing page %s: %s", page_num, e)


# Sample inputs used when the file is executed as a script.
pdf_path = r"combined_v61.pdf"
file_id = 1
tables_range = {
    1: [{'top': 727.0118072976055, 'buttom': 77.52552451539339, 'table_index': 1, 'page_num': 1}],
    2: [{'top': 687.408985176739, 'buttom': 77.04549030786774, 'table_index': 1, 'page_num': 2}],
}
pages = '1-2'

if __name__ == "__main__":
    get_text_content_test(pdf_path, file_id, pages, tables_range)