"""Extract text, tables and a heading catalog from a .docx file as JSON."""

import json
import os

from docx import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from lxml import etree  # noqa: F401 -- no longer used here; kept so the module's import surface is unchanged

RESULT_TYPE_TEXT = 'text'
RESULT_TYPE_TABLE = 'table'

# WordprocessingML main namespace in Clark notation, used to address
# ``w:outlineLvl`` and its ``w:val`` attribute without a prefix map.
_W_NS = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
_OUTLINE_LVL_PATH = './/' + _W_NS + 'outlineLvl'
_VAL_ATTR = _W_NS + 'val'


def build_result(result_type, index, data):
    """Return a content entry dict with the keys ``type``, ``index`` and ``data``."""
    return {
        'type': result_type,
        'index': index,
        'data': data,
    }


def build_catalog_result(index, depth, data):
    """Return a catalog entry dict with the keys ``index``, ``depth`` and ``data``."""
    return {
        'index': index,
        'depth': depth,
        'data': data,
    }


def parse_paragraph(paragraph, index):
    """Build a text entry for *paragraph*.

    Args:
        paragraph: Object exposing a ``text`` attribute; a falsy value is
            treated as having no text.
        index: Position of the paragraph in the document-wide sequence.

    Returns:
        A ``build_result`` dict of type ``RESULT_TYPE_TEXT`` holding the
        stripped text, or ``None`` when the stripped text is empty.
    """
    paragraph_text = paragraph.text.strip() if paragraph else ''
    if paragraph_text:
        return build_result(RESULT_TYPE_TEXT, index, paragraph_text)
    return None


def parse_table(table, index):
    """Build a table entry for *table*.

    Args:
        table: Object exposing ``rows``; each row exposes ``cells`` and each
            cell exposes ``text``.
        index: Position of the table in the document-wide sequence.

    Returns:
        A ``build_result`` dict of type ``RESULT_TYPE_TABLE`` whose data is a
        list of rows, each row being the list of its cells' text.
    """
    table_data = [[cell.text for cell in row.cells] for row in table.rows]
    return build_result(RESULT_TYPE_TABLE, index, table_data)


def _outline_level(p_element):
    """Return the integer ``w:val`` of the first ``w:outlineLvl`` under *p_element*, or None."""
    outline = p_element.find(_OUTLINE_LVL_PATH)
    if outline is None:
        return None
    try:
        return int(outline.get(_VAL_ATTR))
    except (TypeError, ValueError):
        # Attribute missing or not an integer: treat as "no outline level".
        return None


def _heading_style_level(paragraph):
    """Return the trailing digit of a ``Heading...`` style name, or None."""
    style = paragraph.style
    style_name = style.name if style is not None else None
    if not style_name or not style_name.startswith('Heading'):
        return None
    try:
        return int(style_name[-1])
    except ValueError:
        # e.g. a custom style such as "Heading Custom" carries no level digit.
        return None


def parse_docx(docx_path):
    """Parse a .docx file into content JSON and catalog JSON.

    Body children are visited in document order. Every paragraph and every
    table consumes one value of a shared 1-based index, including paragraphs
    whose text is empty (those produce no content entry). A paragraph yields a
    catalog entry when it carries a ``w:outlineLvl`` element or, failing that,
    when its style name starts with ``Heading`` and ends in a digit.

    Args:
        docx_path: Path (or file-like object) handed to ``docx.Document``.

    Returns:
        A ``(content_json, catalog_json)`` tuple of JSON strings. If the
        document cannot be loaded, the error is printed and both strings are
        an empty JSON array.
    """
    try:
        document = Document(docx_path)
    except Exception as e:
        print(f"Error loading document: {e}")
        # Previously execution fell through and crashed on the unbound
        # ``document``; report the failure and hand back empty results.
        empty = json.dumps([], indent=4, ensure_ascii=False)
        return empty, empty

    doc_content = []      # content entries (text + tables)
    catalog_content = []  # catalog (heading) entries
    current_index = 1     # document-wide index shared by paragraphs and tables

    # Build the wrapper lists once: each access to ``document.paragraphs`` /
    # ``document.tables`` constructs a fresh list, so indexing them inside
    # the loop made the traversal quadratic.
    paragraphs = iter(document.paragraphs)
    tables = iter(document.tables)

    for element in document.element.body:
        if isinstance(element, CT_P):  # paragraph
            paragraph = next(paragraphs)
            paragraph_result = parse_paragraph(paragraph, current_index)
            if paragraph_result:
                doc_content.append(paragraph_result)

            # Catalog detection: an explicit outline level takes precedence,
            # otherwise fall back to the heading style name.
            level = _outline_level(element)
            if level is None:
                level = _heading_style_level(paragraph)
            if level is not None:
                # NOTE(review): the same ``+ 1`` is applied to both sources,
                # so a direct outline level of 0 gives depth 1 while the
                # style "Heading 1" gives depth 2. Kept as-is to preserve
                # existing output; confirm which depth consumers expect.
                catalog_content.append(
                    build_catalog_result(current_index, level + 1, paragraph.text)
                )
            current_index += 1
        elif isinstance(element, CT_Tbl):  # table
            table = next(tables)
            table_result = parse_table(table, current_index)
            if table_result:
                doc_content.append(table_result)
            current_index += 1

    return (
        json.dumps(doc_content, indent=4, ensure_ascii=False),
        json.dumps(catalog_content, indent=4, ensure_ascii=False),
    )


def split_text_table(json_data):
    """Split content entries into text entries and table entries.

    Args:
        json_data: Iterable of dicts, each having a ``type`` key.

    Returns:
        A ``(text_json, table_json)`` tuple of JSON strings, each preserving
        the input order of its entries.
    """
    text_elements = [
        element for element in json_data if element['type'] == RESULT_TYPE_TEXT
    ]
    table_elements = [
        element for element in json_data if element['type'] == RESULT_TYPE_TABLE
    ]

    text_elements_json = json.dumps(text_elements, ensure_ascii=False, indent=4)
    table_elements_json = json.dumps(table_elements, ensure_ascii=False, indent=4)
    return text_elements_json, table_elements_json


def append_to_file(file_path, text):
    """Append *text* plus a newline to *file_path* (UTF-8).

    Failures are printed rather than raised, so a write error does not stop
    the caller.
    """
    try:
        with open(file_path, 'a', encoding='utf-8') as file:
            file.write(text + '\n')
    except Exception as e:
        print(f"Error writing to file: {e}")


def main():
    """Parse the sample document and append the three JSON dumps to the output file."""
    current_directory = os.getcwd()
    docx_relative_path = 'file/docx/1.docx'
    file_relative_path = 'file/docx/test.txt'

    docx_path = os.path.join(current_directory, docx_relative_path)
    file_path = os.path.join(current_directory, file_relative_path)

    parsed_content, catalog_content = parse_docx(docx_path)
    json_parsed_content = json.loads(parsed_content)
    text_elements_json, table_elements_json = split_text_table(json_parsed_content)

    append_to_file(file_path, text_elements_json)
    append_to_file(file_path, table_elements_json)
    append_to_file(file_path, catalog_content)


if __name__ == "__main__":
    main()