122 lines
4.5 KiB
Python
122 lines
4.5 KiB
Python
from docx import Document
|
|
import json
|
|
from docx.oxml.table import CT_Tbl
|
|
from docx.oxml.text.paragraph import CT_P
|
|
from lxml import etree
|
|
import os
|
|
|
|
# Tags stored under the 'type' key of every parsed content item.
RESULT_TYPE_TEXT = 'text'
RESULT_TYPE_TABLE = 'table'
|
|
|
|
def build_result(result_type, index, data):
    """Bundle one parsed content item into a plain dict.

    Args:
        result_type: Tag stored under the ``'type'`` key.
        index: Position value stored under the ``'index'`` key.
        data: Payload stored under the ``'data'`` key.

    Returns:
        A dict with the keys ``'type'``, ``'index'`` and ``'data'``,
        inserted in that order.
    """
    return dict(type=result_type, index=index, data=data)
|
|
|
|
def build_catalog_result(index, depth, data):
    """Bundle one catalog (outline) entry into a plain dict.

    Args:
        index: Position value stored under the ``'index'`` key.
        depth: Nesting depth stored under the ``'depth'`` key.
        data: Payload stored under the ``'data'`` key.

    Returns:
        A dict with the keys ``'index'``, ``'depth'`` and ``'data'``,
        inserted in that order.
    """
    return dict(index=index, depth=depth, data=data)
|
|
|
|
def parse_paragraph(paragraph, index):
    """Turn a paragraph into a text result, skipping blank ones.

    Args:
        paragraph: Object exposing a ``text`` attribute; a falsy value is
            treated as having no text.
        index: Position value forwarded to ``build_result``.

    Returns:
        The ``build_result`` dict for the whitespace-stripped text, or
        ``None`` when the paragraph is falsy or its stripped text is empty.
    """
    if not paragraph:
        return None

    stripped = paragraph.text.strip()
    if not stripped:
        return None

    return build_result(RESULT_TYPE_TEXT, index, stripped)
|
|
|
|
def parse_table(table, index):
    """Turn a table into a table result holding a 2-D list of cell texts.

    Args:
        table: Object exposing ``rows``; each row exposes ``cells`` and each
            cell exposes ``text``.
        index: Position value forwarded to ``build_result``.

    Returns:
        The ``build_result`` dict whose data is one list per row, each
        containing that row's cell texts in order.
    """
    grid = [[cell.text for cell in row.cells] for row in table.rows]
    return build_result(RESULT_TYPE_TABLE, index, grid)
|
|
|
|
# Clark-notation prefix for the WordprocessingML main namespace.
_WML_NS = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'


def _heading_level(paragraph):
    """Return the raw heading level of *paragraph*, or None if it has none.

    A ``w:outlineLvl`` element anywhere inside the paragraph XML wins and its
    ``w:val`` attribute is returned as an int. Otherwise a style whose name
    starts with ``'Heading'`` and ends in a number yields that number.
    Malformed values (missing ``w:val``, non-numeric style suffix such as
    ``'Heading Custom'``) are treated as "no level" instead of raising.
    """
    # Search the live element with a namespace-qualified tag: no serialise /
    # re-parse round trip and no dependency on the document's prefix map.
    outline = paragraph._element.find(f'.//{_WML_NS}outlineLvl')
    if outline is not None:
        raw_level = outline.get(f'{_WML_NS}val') or ''
        if raw_level.isdecimal():
            return int(raw_level)

    style_name = paragraph.style.name or ''
    if style_name.startswith('Heading'):
        suffix = style_name[len('Heading'):].strip()
        if suffix.isdecimal():
            return int(suffix)

    return None


def parse_docx(docx_path):
    """Extract body content and heading catalog from a .docx file as JSON.

    The document body is walked in order. Every paragraph and every table
    consumes one value of a shared running index that starts at 1; blank
    paragraphs consume an index too but produce no content item.

    Args:
        docx_path: Path (or file-like object) handed to ``Document``.

    Returns:
        A 2-tuple of JSON strings ``(content, catalog)``. ``content`` is the
        list of ``build_result`` items (text and tables); ``catalog`` is the
        list of ``build_catalog_result`` items for heading paragraphs. If the
        document cannot be loaded, the error is printed and both strings are
        ``'[]'``.
    """
    doc_content = []       # body content: text + tables
    catalog_content = []   # catalog / outline entries

    def to_json(items):
        return json.dumps(items, indent=4, ensure_ascii=False)

    try:
        document = Document(docx_path)
    except Exception as e:
        # Best-effort boundary: report and hand back empty results instead of
        # falling through to code that needs the (unbound) document.
        print(f"Error loading document: {e}")
        return to_json(doc_content), to_json(catalog_content)

    # Build the proxy lists once; the properties rebuild them on each access.
    paragraphs = iter(document.paragraphs)
    tables = iter(document.tables)
    current_index = 1  # running index shared by paragraphs and tables

    for element in document.element.body:
        if isinstance(element, CT_P):  # paragraph
            paragraph = next(paragraphs)
            paragraph_result = parse_paragraph(paragraph, current_index)
            if paragraph_result:
                doc_content.append(paragraph_result)

            # Heading paragraphs additionally get a catalog entry.
            level = _heading_level(paragraph)
            if level is not None:
                # NOTE(review): both sources get +1, but w:outlineLvl appears
                # to be 0-based while "Heading N" is 1-based, so a styled
                # "Heading 1" ends up at depth 2 - confirm this is intended.
                catalog_content.append(
                    build_catalog_result(current_index, level + 1, paragraph.text)
                )
            current_index += 1

        elif isinstance(element, CT_Tbl):  # table
            table = next(tables)
            table_result = parse_table(table, current_index)
            if table_result:
                doc_content.append(table_result)
            current_index += 1

    return to_json(doc_content), to_json(catalog_content)
|
|
|
|
def split_text_table(json_data):
    """Split parsed content items into text items and table items.

    Args:
        json_data: Iterable-twice sequence of dicts, each with a ``'type'``
            key.

    Returns:
        A 2-tuple of JSON strings ``(text_items, table_items)``; items keep
        their original relative order and items of any other type are
        dropped.
    """
    def dump_matching(wanted_type):
        # Keep only items of the requested type, then serialise them.
        selected = [item for item in json_data if item['type'] == wanted_type]
        return json.dumps(selected, ensure_ascii=False, indent=4)

    return dump_matching('text'), dump_matching('table')
|
|
|
|
|
|
def append_to_file(file_path, text):
    """Append *text* followed by a newline to a UTF-8 file.

    Any exception raised while opening or writing is caught and reported
    with ``print``; nothing is re-raised.

    Args:
        file_path: Path of the file to append to.
        text: String to write; a newline is added after it.
    """
    try:
        with open(file_path, mode='a', encoding='utf-8') as handle:
            line = text + '\n'
            handle.write(line)
    except Exception as e:
        print(f"Error writing to file: {e}")
|
|
|
|
if __name__ == "__main__":
    # Resolve the sample input and output paths against the working directory.
    working_dir = os.getcwd()
    docx_path = os.path.join(working_dir, 'file/docx/1.docx')
    file_path = os.path.join(working_dir, 'file/docx/test.txt')

    # Parse the document, then split its content into text and table items.
    parsed_content, catalog_content = parse_docx(docx_path)
    text_elements_json, table_elements_json = split_text_table(
        json.loads(parsed_content)
    )

    # Append text items, table items and the catalog, in that order.
    for section in (text_elements_json, table_elements_json, catalog_content):
        append_to_file(file_path, section)