pdf_code/zzb_data/parse_word/parse_word.py
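
"""Parse a .docx file into JSON: body text, tables, and a heading catalog.

The script walks the document body in order, collects paragraph text and table
cell data under a shared running index, derives a catalog (outline) from
heading paragraphs, and appends the resulting JSON strings to a text file.
"""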

import json
import os

from docx import Document
from lxml import etree
RESULT_TYPE_TEXT = 'text'
RESULT_TYPE_TABLE = 'table'


def build_result(result_type, index, data):
    return {
        'type': result_type,
        'index': index,
        'data': data
    }


def build_catalog_result(index, depth, data):
    return {
        'index': index,
        'depth': depth,
        'data': data
    }


def parse_paragraph(paragraph, index):
    paragraph_text = paragraph.text.strip() if paragraph else ''
    if paragraph_text:
        return build_result(RESULT_TYPE_TEXT, index, paragraph_text)
    return None


def parse_table(table, index):
    table_data = []
    for row in table.rows:
        row_data = [cell.text for cell in row.cells]
        table_data.append(row_data)
    return build_result(RESULT_TYPE_TABLE, index, table_data)
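

# Illustrative shapes of the dicts produced above (example values, not taken from a real document):
#   parse_paragraph(p, 3) -> {'type': 'text', 'index': 3, 'data': 'Some paragraph text'}
#   parse_table(t, 4)     -> {'type': 'table', 'index': 4, 'data': [['h1', 'h2'], ['a', 'b']]}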


def parse_docx(docx_path):
    try:
        document = Document(docx_path)
    except Exception as e:
        print(f"Error loading document: {e}")
        # Return empty JSON arrays so callers can still json.loads the result.
        return json.dumps([]), json.dumps([])
    doc_content = []      # body content (text + tables)
    catalog_content = []  # catalog (heading outline)
    current_index = 1     # running index shared by paragraphs and tables
    paragraph_index = 0
    table_index = 0
    # Namespace map of the document's root XML element, used for the XPath query below.
    # lxml's xpath() rejects a None (default) prefix, so drop it if present.
    xml_root = document.part.element
    namespaces = {prefix: uri for prefix, uri in xml_root.nsmap.items() if prefix}
    # Walk every element of the document body in document order.
    for element in document.element.body:
        if element.tag.endswith('p'):  # paragraph
            paragraph = document.paragraphs[paragraph_index]
            paragraph_index += 1
            paragraph_result = parse_paragraph(paragraph, current_index)
            if paragraph_result:
                doc_content.append(paragraph_result)
                # If the paragraph is a heading, also record it in the catalog.
                p_element = paragraph._element
                # Re-parse as a plain lxml element so an explicit namespace map can be passed to xpath().
                p_element = etree.fromstring(p_element.xml)
                outlineLvl = p_element.xpath('.//w:outlineLvl', namespaces=namespaces)
                if outlineLvl:
                    level = int(outlineLvl[0].get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val'))
                    text = paragraph.text
                    catalog_content.append(build_catalog_result(current_index, level + 1, text))
                else:
                    style_name = paragraph.style.name
                    if style_name.startswith('Heading'):
                        level = int(style_name[-1])  # last character of e.g. 'Heading 2'
                        text = paragraph.text
                        catalog_content.append(build_catalog_result(current_index, level + 1, text))
                current_index += 1  # advance the shared index
        elif element.tag.endswith('tbl'):  # table
            table = document.tables[table_index]
            table_index += 1
            table_result = parse_table(table, current_index)
            if table_result:
                doc_content.append(table_result)
                current_index += 1  # advance the shared index
    return json.dumps(doc_content, indent=4, ensure_ascii=False), json.dumps(catalog_content, indent=4, ensure_ascii=False)


def split_text_table(json_data):
    # Split the elements into text and table groups.
    text_elements = [element for element in json_data if element['type'] == 'text']
    table_elements = [element for element in json_data if element['type'] == 'table']
    # Serialize each group back to a JSON string.
    text_elements_json = json.dumps(text_elements, ensure_ascii=False, indent=4)
    table_elements_json = json.dumps(table_elements, ensure_ascii=False, indent=4)
    return text_elements_json, table_elements_json


def append_to_file(file_path, text):
    try:
        with open(file_path, 'a', encoding='utf-8') as file:
            file.write(text + '\n')
    except Exception as e:
        print(f"Error writing to file: {e}")


if __name__ == "__main__":
    current_directory = os.getcwd()
    docx_relative_path = 'file/docx/1.docx'
    file_relative_path = 'file/docx/test.txt'
    docx_path = os.path.join(current_directory, docx_relative_path)
    file_path = os.path.join(current_directory, file_relative_path)
    parsed_content, catalog_content = parse_docx(docx_path)
    json_parsed_content = json.loads(parsed_content)
    text_elements_json, table_elements_json = split_text_table(json_parsed_content)
    append_to_file(file_path, text_elements_json)
    append_to_file(file_path, table_elements_json)
    append_to_file(file_path, catalog_content)
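
# Expected usage (assuming file/docx/1.docx exists under the working directory):
#   python parse_word.py
# This appends three pretty-printed JSON arrays to file/docx/test.txt in order:
# the text elements, the table elements, and the catalog (heading outline).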