暂时记录,改天再整理
# Scratch script: imports plus a disabled draft for turning a folder of Word
# files into one Excel sheet. Everything below the imports is commented out,
# so running this file only performs the imports.
import os
import re
import subprocess

import pandas as pd
import win32com.client as win
from docx import Document
from requests import get

# NOTE(review): this snippet was recovered from a paste that had been
# collapsed onto a single line. Line breaks and indentation of the draft
# below were reconstructed from the syntax -- confirm before re-enabling.
#
# The draft mixes two alternative routes inside the same loop:
#   1. LibreOffice (`soffice --convert-to docx`) + python-docx: reads
#      `doc.tables[1]`, using row 0 cells as keys and row 1 cells as values.
#   2. Word COM automation (win32com): `SaveAs(..., FileFormat=12)` and
#      `doc.Tables(2)` / `table.Cell(...)`.
#
# NOTE(review): `dir`, `dn` and `label` are used but never assigned in the
# visible draft (`dir` would resolve to the builtin) -- define them first.
# NOTE(review): the inner `for j` loop indexes columns with `i + 1`, not
# `j + 1` -- presumably a typo; verify.
# NOTE(review): recent pandas releases no longer accept `encoding=` in
# `DataFrame.to_excel` -- check against the installed version.

# dir2 = 'C:/Users/User/Documents/gzzw/'
# names = os.listdir(dir2)
# data = {}
# word = win.Dispatch('Word.Application')
# for (i, n) in enumerate(names):
#     subprocess.call(['soffice', '--headless', '--convert-to', 'docx', '--outdir', dir2 + str(i) + '.docx', dir + n])
#     doc = Document(docx=dir2 + n)
#     table = doc.tables[1]
#     row = table.rows[0]
#     row1 = table.rows[1]
#     for k, v in zip(row.cells, row1.cells):
#         if i == 0:
#             data[k.text] = [v.text]
#         else:
#             data[k.text].append(v.text)
#     gs = re.match(r'.*_(.*)_.*', n)
#     dn.append(gs.group(1))
#     os.rename(dir + n, dir + str(i) + '.doc')
#     doc = word.Documents.Open(dir + n)
#     doc.SaveAs(dir2 + str(i) + '.docx', FileFormat=12)
#     table = doc.Tables(2)
#     for j in range(table.Columns.Count):
#         print(table.Cell(Row=1, Column=i + 1).Range.Text)
#         label.append(table.Cell(Row=1, Column=i + 1).Range.Text.encode('utf8'))
#         dn.append(table.Cell(Row=2, Column=i + 1).Range.Text.encode('utf8'))
# word.Quit()
# sheel = pd.DataFrame(data)
# sheel.to_excel(dir2 + 'statics.xlsx', index=False, encoding='utf8')
一些参考链接:
1.https://code.activestate.com/recipes/279003-converting-word-documents-to-text/
2.https://stackoverflow.com/questions/1468099/python-win32-extensions-documentation
3.https://stackoverflow.com/questions/10366596/how-to-read-contents-of-an-table-in-ms-word-file-using-python
4.https://stackoverflow.com/questions/38468442/multiple-doc-to-docx-file-conversion-using-python
5.https://www.jianshu.com/p/4fa504c720c1