152 lines
4.7 KiB
Python
152 lines
4.7 KiB
Python
|
import camelot
|
|||
|
import re
|
|||
|
from multiprocessing import Pool
|
|||
|
import os, time, random
|
|||
|
import json
|
|||
|
from config import MYSQL_HOST,MYSQL_USER,MYSQL_PASSWORD,MYSQL_DB
|
|||
|
from datetime import datetime
|
|||
|
# 读取PDF
|
|||
|
import PyPDF2
|
|||
|
# 分析PDF的layout,提取文本
|
|||
|
from pdfminer.high_level import extract_pages
|
|||
|
from pdfminer.layout import LTTextBoxHorizontal
|
|||
|
import pdfplumber
|
|||
|
import mysql.connector
|
|||
|
import db_service
|
|||
|
from multiprocessing import Process
|
|||
|
from config import REDIS_HOST,REDIS_PORT,REDIS_PASSWORD
|
|||
|
import utils
|
|||
|
|
|||
|
def text_in_table(top, tables_range, page_num):
    """Report whether a vertical coordinate lies inside a table on a page.

    Args:
        top: Vertical coordinate to test.
        tables_range: Mapping of page number to a list of dicts, each holding
            the ``'top'`` and ``'buttom'`` (sic) edges of one table.
        page_num: Key of the page to look up in ``tables_range``.

    Returns:
        True when ``top`` is strictly between the ``'buttom'`` and ``'top'``
        edges of any table recorded for ``page_num``; False otherwise,
        including when the page has no recorded tables.
    """
    # ``or ()`` covers both a missing page and an empty table list.
    return any(
        span['top'] > top > span['buttom']
        for span in tables_range.get(page_num) or ()
    )
|
|||
|
|
|||
|
def get_text_type(text: str):
    """Classify a text fragment as page header, page footer or body text.

    All whitespace is removed from ``text`` before it is classified.

    Returns:
        ``'page_header'`` when the text contains '年度报告';
        ``'page_footer'`` when the text is a bare page number such as
        ``12`` or ``12/200``, or is shorter than 20 characters and ends
        with '页';
        ``'text'`` in every other case.
    """
    compact = re.sub(r"\s", "", text)

    # Header check comes first, so a header that ends with '页' stays a header.
    if '年度报告' in compact:
        return 'page_header'

    is_page_number = re.match(r'^\d+(/\d+)?$', compact) is not None
    is_short_page_label = compact.endswith('页') and len(compact) < 20
    if is_page_number or is_short_page_label:
        return 'page_footer'

    return 'text'
|
|||
|
|
|||
|
# Read the text content of a PDF file (the table-exclusion filter is currently disabled)
|
|||
|
def get_text_content(pdf_path, file_id, tables_range, conn, cursor):
    """Extract horizontal text boxes from a PDF and store them in the database.

    Each page yielded by ``extract_pages`` is scanned. For every
    ``LTTextBoxHorizontal`` element the text has all whitespace removed, is
    checked with ``delete_flag`` and, when kept, is written through
    ``db_service.insert_pdf_text_info`` with ``file_id`` and the 1-based
    page number.

    Errors are handled best-effort: a failure on one element or one page is
    printed and processing continues with the next one. Errors raised by
    ``extract_pages`` itself (for example while opening the file) propagate.

    Args:
        pdf_path: Path of the PDF file to read.
        file_id: Identifier stored with every inserted row.
        tables_range: Currently unused. The table filter that consumed it is
            disabled, so text located inside tables is stored as well.
        conn: Database connection handed to ``db_service.insert_pdf_text_info``.
        cursor: Database cursor handed to ``db_service.insert_pdf_text_info``.

    Returns:
        None. The extracted text is only written to the database.
    """
    # Pages are numbered from 1 so stored rows and error messages agree.
    for pagenum, page in enumerate(extract_pages(pdf_path), start=1):
        try:
            # A layout page is iterable over its elements; no need to reach
            # into the private ``_objs`` list.
            for i, element in enumerate(page):
                try:
                    # Only horizontal text boxes carry the text we want.
                    if not isinstance(element, LTTextBoxHorizontal):
                        continue

                    # Debug trace of every text box that is processed.
                    print(element)

                    # ``\s`` also matches newlines, so a single substitution
                    # flattens the box into one whitespace-free string.
                    line_text = re.sub(r"\s", "", element.get_text())
                    if delete_flag(line_text):
                        continue

                    # NOTE: filtering with text_in_table(element.bbox[3],
                    # tables_range, pagenum) is intentionally not applied.
                    db_service.insert_pdf_text_info({
                        'file_id': file_id,
                        'page_num': pagenum,
                        'text': line_text,
                    }, conn, cursor)
                except Exception as e:
                    # Best effort: report the element and keep going.
                    print(f'{pagenum}页{i}处理异常')
                    print(e)

        except Exception as e:
            # Best effort: report the page and continue with the next one.
            print(f'{pagenum}页处理异常')
            print(e)
|
|||
|
|
|||
|
def delete_flag(text: str):
    """Decide whether an extracted text fragment should be discarded.

    A fragment is discarded when any of the following holds:

    * ``utils.under_non_alpha_ratio(text)`` is truthy (presumably the share
      of alphabetic characters is too low; confirm against ``utils``);
    * it contains none of the punctuation marks comma, full stop,
      enumeration comma or parentheses, i.e. it does not look like prose;
    * it contains both '适用' and '不适用';
    * it contains both '是' and '否'.

    Args:
        text: Fragment to inspect, expected to be whitespace-free already.

    Returns:
        True when the fragment should be dropped, False when it should be kept.
    """
    if utils.under_non_alpha_ratio(text):
        return True

    # A character class is used so the parentheses are matched literally.
    # Written as a bare alternation, "(|)" is an empty group that matches at
    # every position, which made this check never fire. ASCII and full-width
    # forms are both listed because either may appear in extracted text.
    if not re.search(r'[,,。、()()]', text):
        return True

    # '不适用' contains '适用', so this fires for any text containing '不适用'.
    if '适用' in text and '不适用' in text:
        return True

    if '是' in text and '否' in text:
        return True

    return False
|
|||
|
|
|||
|
def get_table_range(file_path, file_id, pages, tables_range):
    """Record table extents of a PDF, then extract and store its text.

    Tables are detected with ``camelot.read_pdf``; the top and bottom edges
    of each one are appended to ``tables_range`` under its page number.
    Afterwards ``get_text_content`` is run on the same file with a MySQL
    connection opened here. The connection and cursor are always closed,
    even when table detection or text extraction raises.

    Args:
        file_path: Path of the PDF file to process.
        file_id: Identifier forwarded to ``get_text_content``.
        pages: Page selection string forwarded to ``camelot.read_pdf``
            (the script entry point passes ``'all'``).
        tables_range: Dict mutated in place; maps page number to a list of
            ``{'top', 'buttom', 'table_index', 'page_num'}`` dicts.

    Returns:
        None.
    """
    task_name = f'解析表格{pages}'
    print('Run task %s (%s)...' % (task_name, os.getpid()))
    start = time.time()

    conn = mysql.connector.connect(
        host=MYSQL_HOST,
        user=MYSQL_USER,
        password=MYSQL_PASSWORD,
        database=MYSQL_DB,
    )
    try:
        # Cursor used to execute the SQL statements.
        cursor = conn.cursor(buffered=True)
        try:
            tables = camelot.read_pdf(
                file_path,
                pages=pages,
                strip_text=',\n',
                copy_text=['v', 'h'],
                shift_text=['l'],
            )
            for t in tables:
                page_num = int(t.page)
                # Bounding box indices 3 and 1 are read as the upper and
                # lower edge of the table respectively.
                tables_range.setdefault(page_num, []).append({
                    'top': t._bbox[3],
                    'buttom': t._bbox[1],
                    'table_index': int(t.order),
                    'page_num': page_num,
                })

            get_text_content(file_path, file_id, tables_range, conn, cursor)
        finally:
            cursor.close()
    finally:
        # Release the connection even if parsing or extraction failed.
        conn.close()

    end = time.time()
    print('Task %s runs %0.2f seconds.' % (task_name, (end - start)))
|
|||
|
|
|||
|
if __name__ == "__main__":
    # Manual run against a local sample report: detect tables on all pages,
    # then extract and store the text under a fixed test file id.
    # Earlier experiments that called get_table_measure and
    # merge_consecutive_arrays are no longer part of this entry point.
    tables_range = dict()
    path = "/Users/zhengfei/Desktop/cb/002315-2023-nb-nb.pdf"
    get_table_range(
        path,
        file_id='5555',
        pages='all',
        tables_range=tables_range,
    )
|