-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnew_procesing.py
151 lines (131 loc) · 7 KB
/
new_procesing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import pdfplumber,os,glob,re,pathlib,datetime,string,shutil,time
import pandas as pd
from itertools import chain
# Remember when the run started; the closing banner reports the elapsed time.
t0 = time.time()
# Opening banner: the title centred in a 30-character line padded with '*'.
print(f"{'Start Move All':*^30}")
# Category name -> regex alternation of keywords that identify the category.
# Order matters: catalogf() takes the first category with the highest hit
# count, so on a tie the category listed earlier wins.
list_tags = {
    '油费': '汽油|柴油|加油',
    '交通': '客运|交通|地铁|滴滴|出行|摩拜|骑安|客运服务',
    '差旅': '差旅|酒店|住宿|寄存|旅游|旅店|宾馆|旅行',
    '文娱': '广播影视服务|文化|电影|影视|娱乐',
    '服饰': '纺织产品|服装|服饰|衣服|衫|裤|裙|袜|鞋|饰品|运动服|盖璞|迅销|飒拉',
    '办公': '纸制品|印刷|文具|打印纸|笔|文件夹|胶水|回形针|剪刀|纸刀|订书|书桌垫',
    '数码设备': '电线电缆|配电控制设备|移动通信设备|数码|设备|电脑|手机|麦克风|耳机|相机|USB|转换器|插座|路由|显示器|键盘|鼠标|灯',
    '家具': '家具|照明装置',
    '家电': '非电力家用器具',
    '医疗': '医疗服务|宠物|美容|护肤|自疗|理疗|健康',
    '教育服务': '教育服务|培训|考试|报名|课程',
    '技术服务': '信息技术服务|增值服务',
    '餐饮': '餐饮服务',
    '食品': '植物油|茶|饲料|蔬菜|水果|饮料|食品|糖|糖果|焙烤食品|肉及肉制品|方便食品|谷物|调味品|乳制品|水产加工品|果类加工品|营养保健食品|酒|海水产品|加工盐|熟肉制品',
    '物业管理': '售电|管理费|物业管理|租赁|保洁|垃圾费|租金|供电|水冰雪',
    '运输服务': '运输服务',
    '物流服务': '物流',
    '现代服务': '设计服务|现代服务',
    '生活百货': '预付|美容护肤品|日用杂品|其他化学制品|洗涤剂',
    '经营': '无形资产|服务费|体育用品|轴承|五金|包装',
    '详见销货清单': '详见销货清单',
}
# Chinese / full-width punctuation.
# The previous literal was closed by an embedded '"', which turned everything
# after the third character into a comment.  The set is now spelled out in
# full; the ASCII look-alikes are written as explicit full-width escapes
# (plain ASCII punctuation is already covered by ``en_pun``).  None of these
# characters is special inside a regex character class, so the string can
# still be interpolated into one as-is.
zhon_pun = (
    '!?。'
    # Full-width forms of ! ? " # $ % & ' ( ) * + , - . / : ; < = > @ [ \ ] ^ _ ` { | } ~
    '\uff01\uff1f\uff02\uff03\uff04\uff05\uff06\uff07\uff08\uff09\uff0a\uff0b'
    '\uff0c\uff0d\uff0e\uff0f\uff1a\uff1b\uff1c\uff1d\uff1e\uff20\uff3b\uff3c'
    '\uff3d\uff3e\uff3f\uff40\uff5b\uff5c\uff5d\uff5e'
    # Half-width CJK punctuation and white parentheses
    '\uff5f\uff60\uff61\uff62\uff63\uff64\u2985\u2986'
    # CJK symbols, brackets and quotation marks
    '、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿'
    # Dashes, curly quotes, ellipsis, hyphenation point, wavy low line
    '–—‘’‛“”„‟…‧﹏'
)
# ASCII whitespace and punctuation plus the full-width colon and semicolon.
en_pun = string.whitespace + string.punctuation + '\uff1a\uff1b'
# Character class matching any single whitespace/punctuation character above.
# re.escape keeps ']', '\\', '^' and '-' literal instead of relying on how
# they happen to be ordered inside string.punctuation.
PAT1 = '[' + re.escape(zhon_pun + en_pun) + ']'
# Falls back to the catalog '其他' when no keyword matches.
def catalogf(content):
    """Classify invoice content by the keywords found in its starred item names.

    Every ``*...*`` run in ``content`` made of Chinese characters, whitespace
    and ``zhon_pun`` characters is collected.  For each category in
    ``list_tags`` the number of runs matched by its keyword pattern is counted,
    and the category with the highest count is returned; on a tie the category
    listed first in ``list_tags`` wins.  Returns '其他' when nothing matches.
    """
    starred = re.findall(rf'\*[{zhon_pun}\u4e00-\u9fa5\s\n]+\*', content)
    hits = {
        name: sum(1 for item in starred if re.search(pattern, item))
        for name, pattern in list_tags.items()
    }
    # max() returns the first maximal key, which preserves the category order.
    best = max(hits, key=hits.get)
    return best if hits[best] > 0 else '其他'
# Cells that are too short to stand alone are merged with the cell after them.
def list_combine(list1):
    """Yield the strings of ``list1`` with newlines removed, merging short ones.

    Each item has its newline characters stripped.  If fewer than 6 characters
    remain, the following item is consumed and appended after a single space
    (the appended item is left untouched, newlines included).  A short item at
    the very end is yielded on its own.
    """
    cells = iter(list1)
    exhausted = object()  # sentinel: distinguishes "no more items" from any real value
    for cell in cells:
        merged = cell.replace('\n', '')
        if len(merged) < 6:
            follower = next(cells, exhausted)
            if follower is not exhausted:
                merged = merged + ' ' + follower
        yield merged
# Column layout of the record sheet.
RECORD_HEADER = [
    '序号', '发票号码', '卖家', '卖家号码', '买家', '买家号码',
    '开票日期', '月份', '金额', '类别', '内容',
]
# Working folder; the record workbook is kept directly inside it.
FILE_FOLDER = r'D:\xx'
RECORD_PATH = os.path.join(FILE_FOLDER, 'iv_records.xlsx')
# Continue from the existing workbook when there is one, otherwise start with
# an empty sheet that only has the header columns.
if os.path.exists(RECORD_PATH):
    df = pd.read_excel(RECORD_PATH, dtype=object, na_filter=False)
else:
    df = pd.DataFrame(columns=RECORD_HEADER)
# Snapshot of the records as loaded, used later to detect whether anything changed.
df1 = df.copy()
# Collect every PDF that sits directly in FILE_FOLDER (sub-folders are not searched).
p4 = [str(p1) for p1 in pathlib.Path(FILE_FOLDER).glob('*') if str(p1).endswith('.pdf')]
# Value written to the '序号' column.
# NOTE(review): it restarts at 1 on every run even when earlier records were
# loaded from the workbook - confirm whether that is intended.
number1 = 1

# Separator accepted after a field label: whitespace, ':' or the full-width colon.
_LABEL_SEP = r'[\s:\uff1a]'


def _match_after_label(label_pattern, value_pattern, text):
    """Match ``value_pattern`` directly after the first ``label_pattern`` hit.

    The label text actually found is stripped, regex-escaped and used as a
    look-behind anchor.  Returns the ``re.Match`` object, or ``None`` when the
    label (or a value following it) is absent - the caller decides whether
    that is an error, instead of silently picking up unrelated text.
    """
    label = re.search(label_pattern, text)
    if label is None:
        return None
    anchor = re.escape(label.group().strip())
    return re.search(f'(?<={anchor}){value_pattern}', text)


def _required_text(label_pattern, text, what):
    """Return the rest of the line after a label with PAT1 characters removed.

    Raises ``ValueError`` when the label is missing or nothing follows it.
    """
    found = _match_after_label(label_pattern, '.*', text)
    value = re.sub(PAT1, '', found.group()) if found else ''
    if not value:
        raise ValueError(f'{what} not found')
    return value


def _tax_code(cell):
    """Return the taxpayer id following '纳税?识别号' in ``cell`` ('' if absent)."""
    found = _match_after_label(r'纳税.{1}识别号', '.*', cell)
    return re.sub(PAT1, '', found.group()) if found else ''


def _party_name(cell, what):
    """Return the name following '称' + separator in a buyer/seller cell.

    The name may contain Chinese characters and ASCII or full-width
    parentheses.  Raises ``ValueError`` when no name can be located.
    """
    found = _match_after_label(
        rf'称\s*{_LABEL_SEP}', r'\s?[\u4e00-\u9fa5()\uff08\uff09]+', cell
    )
    if found is None:
        raise ValueError(f'{what} name not found')
    return found.group().strip()


def _total_price(cell):
    """Return the amount after '小写' and a yen sign as a float()-parsable string.

    Both the half-width and the full-width yen sign are accepted and thousands
    separators are dropped.  Raises ``ValueError`` when no number is found, so
    an unreadable amount skips this file instead of breaking the final
    numeric conversion of the whole sheet.
    """
    tail = re.search(r'(?<=小写).*|$', cell).group()
    amount = re.search(r'[\xa5\uffe5]\s*(-?\d[\d,]*(?:\.\d+)?)', tail)
    if amount is None:
        raise ValueError('total price not found')
    return amount.group(1).replace(',', '')


def _take_cell(cells, predicate, what):
    """Remove and return the first cell satisfying ``predicate``.

    Raises ``ValueError`` when no cell qualifies.
    """
    for index, cell in enumerate(cells):
        if predicate(cell):
            return cells.pop(index)
    raise ValueError(f'{what} cell not found')


def _read_invoice(pdf):
    """Extract all record fields from an opened pdfplumber document.

    Returns a dict with the keys ``date_text``, ``issued`` (datetime),
    ``invoice_code``, ``buyer``, ``buyer_code``, ``seller``, ``seller_code``,
    ``price`` and ``content``.  Raises ``ValueError`` (or whatever pdfplumber
    raises) when a required field cannot be read; every value is computed
    afresh, so nothing can leak in from a previously processed file.
    """
    first_page = pdf.pages[0]
    text = first_page.extract_text_simple(x_tolerance=4, y_tolerance=3)
    date_text = _required_text(rf'开票日期{_LABEL_SEP}', text, 'invoice date')
    issued = datetime.datetime.strptime(date_text, '%Y年%m月%d日')
    invoice_code = _required_text(rf'发票号码{_LABEL_SEP}', text, 'invoice number')

    table = first_page.extract_table(table_settings={"text_tolerance": 4})
    if not table:
        raise ValueError('no table found on the first page')
    # Flatten the table, drop empty cells and glue short label cells to their values.
    cells = list(list_combine(filter(None, chain(*table))))

    # Each cell is removed once used so later searches cannot match it again.
    buyer_cell = _take_cell(cells, lambda c: '购买' in c, 'buyer')
    seller_cell = _take_cell(cells, lambda c: '销售' in c, 'seller')
    price_cell = _take_cell(cells, lambda c: '小写' in c, 'total price')
    if len(pdf.pages) > 1:
        # Multi-page invoice: the text of the second page is used as the content.
        content = pdf.pages[1].extract_text_simple(x_tolerance=4, y_tolerance=3)
    else:
        content = _take_cell(cells, lambda c: '名称' in c and '*' in c, 'item list')

    return {
        'date_text': date_text,
        'issued': issued,
        'invoice_code': invoice_code,
        'buyer': _party_name(buyer_cell, 'buyer'),
        'buyer_code': _tax_code(buyer_cell),
        'seller': _party_name(seller_cell, 'seller'),
        'seller_code': _tax_code(seller_cell),
        'price': _total_price(price_cell),
        'content': content,
    }


for file_path in p4:
    try:
        # The PDF is closed before the file is moved: an open handle can block
        # the rename on Windows.
        with pdfplumber.open(file_path) as pdf:
            invoice = _read_invoice(pdf)
        issued = invoice['issued']
        catalog1 = catalogf(invoice['content'])

        # 1. Target folder: <FILE_FOLDER>/<buyer>/<year month>
        folder4 = os.path.join(FILE_FOLDER, invoice['buyer'], issued.strftime('%Y %m月'))
        os.makedirs(folder4, exist_ok=True)

        # 2. Rename and move the file into that folder.
        fname = (
            f"{invoice['seller']}-{catalog1}-{invoice['price']}"
            f"-{invoice['date_text']}-{invoice['invoice_code']}.pdf"
        )
        new_file = os.path.join(folder4, fname)
        shutil.move(file_path, new_file)

        # 3. Record the invoice only once the file has really been filed, so a
        #    failed move leaves neither a record nor a half-processed file.
        df.loc[len(df), :] = [
            number1,
            invoice['invoice_code'],
            invoice['seller'],
            invoice['seller_code'],
            invoice['buyer'],
            invoice['buyer_code'],
            invoice['date_text'],
            issued.strftime('%Y年%m月'),
            invoice['price'],
            catalog1,
            invoice['content'],
        ]
        number1 += 1
        print(new_file)
    except Exception as e:
        # Per-file boundary: report the problem and continue with the next PDF.
        print(f'Error occurred while processing {file_path}:', e)
def _normalise_records(frame):
    """Return a cleaned copy of a record sheet that is safe to compare and save.

    Invoice numbers become text, rows are de-duplicated on the invoice number
    (the last occurrence wins) and the amount column becomes float64.
    Unparsable amounts turn into NaN instead of aborting the run before the
    workbook is written.
    """
    frame = frame.copy()
    # Convert before de-duplicating so that 123 and '123' count as the same invoice.
    frame['发票号码'] = frame['发票号码'].apply(str)
    frame = frame.drop_duplicates(['发票号码'], keep='last').reset_index(drop=True)
    amounts = frame['金额'].astype(str).str.replace(',', '', regex=False)
    frame['金额'] = pd.to_numeric(amounts, errors='coerce').astype(float)
    return frame


df = _normalise_records(df)
# Compare against the loaded snapshot after giving it the same normalisation:
# DataFrame.equals() requires identical column dtypes, so comparing the raw
# all-object snapshot with the float amount column would never report "equal".
# The first column (sequence number) is ignored on purpose.
if not _normalise_records(df1).iloc[:, 1:].equals(df.iloc[:, 1:]):
    df.to_excel(RECORD_PATH, index=False)
    # Open the refreshed workbook; the path is quoted so folders containing
    # spaces work.  NOTE(review): assumed to run only when the workbook was
    # rewritten - confirm against the intended workflow.
    os.system('start excel "%s"' % RECORD_PATH)
print(('End Move All time: %.3f' % (time.time() - t0)).center(30, '*'))