# 批量复制提取Word中所有的表格到Excel(Python办公自动化)

import docx
import os
from win32com import client as wc
import pandas as pd

# Directory scanned for Word files: the current working directory at the
# moment this module is loaded.
word_paths = os.getcwd()

# Convert .doc to .docx. Per the original author, the files cannot be opened
# as docx later on unless they go through this conversion.
def convertdoc_docx(path):
    """Convert every ``.doc`` file directly inside *path* to ``.docx``.

    The conversion is delegated to Microsoft Word through COM automation
    (``win32com``). Each converted file is written next to its source with
    only the extension changed.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        None. Progress is reported with ``print``.
    """
    doc_list = [
        # Absolute paths: Word resolves relative paths against its own
        # working directory, not this process's.
        os.path.abspath(os.path.join(path, name))
        for name in os.listdir(path)
        # Compare the real extension so names such as "mydoc" are ignored,
        # and skip Word's "~$" owner/lock files, which are not documents.
        if os.path.splitext(name)[1].lower() == '.doc'
        and not name.startswith('~$')
    ]
    if not doc_list:
        # Nothing to convert: do not start Word at all.
        return

    word = wc.Dispatch('Word.Application')
    try:
        for doc_path in doc_list:
            print(doc_path)
            # Swap only the extension; a plain str.replace('doc', 'docx')
            # would also rewrite "doc" inside directory or file names.
            save_path = os.path.splitext(doc_path)[0] + '.docx'
            doc = word.Documents.Open(doc_path)
            try:
                # 12 is Word's wdFormatXMLDocument (.docx) save format.
                doc.SaveAs(save_path, 12, False, "", True,
                           "", False, False, False, False)
            finally:
                doc.Close()
            print('{} Save sucessfully '.format(save_path))
    finally:
        # Always shut Word down, even if a document failed to convert.
        word.Quit()


def docx2dataframe(filepath) -> pd.DataFrame:
    """Return the first readable table of a ``.docx`` file as a DataFrame.

    Tables are tried in document order. A table that raises while being read
    is reported and skipped, and the next one is tried. Cells with no text
    are stored as empty strings.

    NOTE(review): only the first readable table is returned; later tables in
    the same document are ignored. Confirm this is intended.

    Args:
        filepath: Path of the ``.docx`` file to read.

    Returns:
        A DataFrame holding the cell texts of the first table that could be
        read, or an empty DataFrame when the document has no readable table
        (so callers can always call DataFrame methods on the result).
    """
    doc = docx.Document(filepath)
    for index, table in enumerate(doc.tables):
        try:
            n_cols = len(table.columns)
            # Pre-fill the grid so cells without text stay as ''.
            grid = [[''] * n_cols for _ in range(len(table.rows))]
            for i, row in enumerate(table.rows):
                for j, cell in enumerate(row.cells):
                    text = cell.text
                    if text:
                        grid[i][j] = text
            frame = pd.DataFrame(grid)
        except Exception as exc:
            # Best effort, as in the original: a few tables may be malformed
            # and can be added by hand. Report them instead of hiding them.
            print(f'{filepath}: skipped table {index} ({exc!r})')
            continue
        return frame
    return pd.DataFrame()


if __name__ == "__main__":
    # Convert every .doc file in the working directory to .docx first.
    convertdoc_docx(word_paths)

    # Paths of the .docx files to export. Names starting with "~$" are
    # skipped: those are Word's lock files, not real documents.
    docx_list = [
        os.path.join(word_paths, i)
        for i in os.listdir(word_paths)
        if str(i).endswith('.docx') and not str(i).startswith('~$')
    ]

    if not docx_list:
        # Closing a workbook that has no sheet fails, so do not create one.
        print('No .docx files found in {}'.format(word_paths))
    else:
        # The context manager writes and closes target.xlsx exactly once,
        # even on error. ExcelWriter.save() is not used: it was removed in
        # pandas 2.0.
        with pd.ExcelWriter('target.xlsx') as excel_writer:
            for index, docx_file in enumerate(docx_list):
                frame = docx2dataframe(docx_file)
                if frame is None:
                    # No readable table: keep an empty sheet so that sheet
                    # numbers still line up with the file order.
                    print('\n{}: no table extracted'.format(docx_file))
                    frame = pd.DataFrame()
                frame.to_excel(
                    excel_writer=excel_writer,
                    sheet_name=f'sheet{index}',
                    index=False,
                )
                print('\r' + str(index), end='')  # progress indicator

# 版权声明:本文为weixin_45502929原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。