-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path_02_excel_in_docx_statistic.py
100 lines (77 loc) · 3.19 KB
/
_02_excel_in_docx_statistic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# pip install python-docx
# pip install numpy
# pip install pandas
# 只能针对单一类型的文件,几乎没有泛化性
import os
import docx
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import sys
source_docxs = []
# In[2]指定源docx文档目录
file_path = r"D:\_PyCharmWorkSpace\_homework_interesting\1717测绘毕业要求评价表_source_2"
for file in os.listdir(file_path):
if file.endswith(".docx") or file.endswith(".doc"):
source_docxs.append(file_path + "\\" + file)
file = source_docxs[0]
docx_file = docx.Document(file)
row_str = []
for table in docx_file.tables:
for row in table.rows:
row_str.append([cell.text for cell in row.cells])
row_str = np.array(row_str)
# 这里应当是需要修改的,2:14是在我知道行列的情况下
# 实际应使用的是row_str[0], row_str[1]来代替14和6 #这个已更正好了
# 2和1不需要代替,是因为需要指定哪几行是行标题和列标题
frame = DataFrame(row_str)
# frame.iloc[2:14,1:6]=0
frame.iloc[2:row_str.shape[0],1:row_str.shape[1]]=0
for i in range(len(source_docxs)):
file = source_docxs[i]
docx_file = docx.Document(file)
docx_table = docx_file.tables[0]
for table in docx_file.tables:
for i in range(2, row_str.shape[0]):
for j in range(1, row_str.shape[1]):
table.cell(i,j).text = table.cell(i,j).text + "\\"
for table in docx_file.tables:
for i in range(2, row_str.shape[0]):
for j in range(1, row_str.shape[1]):
if str(table.cell(i,j).text) == "\\":
table.cell(i,j).text = "0"
else:
table.cell(i,j).text = "1"
row_str = []
for table in docx_file.tables:
for row in table.rows:
row_str.append([cell.text for cell in row.cells])
row_str = np.array(row_str)
# print("rows:"+str(row_str.shape[0]))
# print("columns:"+str(row_str.shape[1]))
row_str_pd = pd.DataFrame()
row_str_pd = pd.DataFrame(row_str)
for i in range(1,row_str.shape[1]):
row_str_pd.iloc[2:row_str.shape[0], i] = pd.to_numeric(row_str_pd.iloc[2:row_str.shape[0], i])
frame.iloc[2:row_str.shape[0],1:row_str.shape[1]] += row_str_pd.iloc[2:row_str.shape[0],1:row_str.shape[1]]
sys.stdout.write("\r"+ "文件编号"+ str(i) + file)
sys.stdout.flush()
new_docx = docx.Document()
new_docx_table = new_docx.add_table(rows = row_str.shape[0], cols = row_str.shape[1])
for i in range(2, row_str.shape[0]):
for j in range(1, row_str.shape[1]):
new_docx_table.cell(i,j).text = "0"
for table in new_docx.tables:
# 有几行列标题,应手动输入
for i in range(2):
for j in range(row_str.shape[1]):
table.cell(i,j).text = row_str[i,j]
for table in new_docx.tables:
# 从第几行开始设置行标题,应手动输入
for i in range(2,row_str.shape[0]):
for j in range(1):
table.cell(i,j).text = row_str[i,j]
for i in range(2, row_str.shape[0]):
for j in range(1, row_str.shape[1]):
new_docx_table.cell(i,j).text = str(frame.iloc[i,j])
new_docx.save("new_docx.docx")