-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdkbscraper.py
180 lines (131 loc) · 4.3 KB
/
dkbscraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/usr/bin/env python
# encoding: utf-8
# This file is in the Public Domain as specified by
# http://creativecommons.org/publicdomain/zero/1.0/
import requests
import lxml.html
import getpass
import bs4
import collections
import tempfile
import shutil
import os
import os.path
# Immutable record describing one entry of the DKB postbox listing:
#   title    -- document title as shown in the postbox table
#   is_read  -- whether the document is marked as read
#   url      -- download link of the document (joined with the base URL
#               by the session when downloading)
#   filename -- file name derived from the download link
PostboxDocument = collections.namedtuple(
    'PostboxDocument',
    'title is_read url filename',
)
class DKBSession(object):
    """
    Scraping session for the DKB online banking web interface.

    Wraps a ``requests.Session`` and offers login, logout, listing of the
    postbox documents and downloading of a single postbox document.

    Usage
    -----
    >>> import readline
    >>> import dkbscraper
    >>> dkbs = dkbscraper.DKBSession()
    >>> dkbs.login(input("Username: "))
    >>> documents = list(dkbs.postbox_items())
    >>> dkbs.download_document(documents[10], ['test.pdf'])
    >>> dkbs.logout()
    """

    base_url = 'https://banking.dkb.de'
    login_url = '/dkb/-?$javascript=disabled'

    # Both are filled in by a successful login(); None means "not logged in".
    logout_url = None
    postbox_url = None

    def __init__(self, verbose=True):
        """
        Create a new, not yet logged in session.

        Parameters
        ----------
        verbose : bool
            If true, progress messages are printed to stdout.
        """
        # Initialize HTTP session
        self.s = requests.Session()
        self.verbose = verbose

    def login(self, username, password=None):
        """
        Login to DKB Online Banking

        Parameters
        ----------
        username : str
            Value submitted in the ``j_username`` field of the login form.
        password : str, optional
            Value submitted in the ``j_password`` field. If omitted, the
            password is requested interactively via ``getpass.getpass()``.

        Returns
        -------
        bool
            Always ``True``; failures are reported by exception.

        Raises
        ------
        RuntimeError
            If the page returned after posting the login form does not
            contain the text 'Finanzstatus'.
        """
        if self.verbose:
            print('Login to DKB Online Banking')

        # Get DKB Banking login page
        r = self.s.get(self.base_url + self.login_url)
        login_page = lxml.html.fromstring(r.text)

        # Get DKB Banking login form
        login_form = login_page.forms[0]

        # Fill in username and password
        if password is None:
            password = getpass.getpass()
        login_form.fields['j_username'] = username
        login_form.fields['j_password'] = password

        # Post login
        r = self.s.post(
            self.base_url + login_form.action,
            data=dict(login_form.fields)
        )

        # Parse returned page. The parser is named explicitly so the result
        # does not depend on which parsers happen to be installed; lxml is
        # already a requirement of this module.
        soup = bs4.BeautifulSoup(r.text, 'lxml')
        if not soup.find(text='Finanzstatus'):
            raise RuntimeError('Login to DKB Online Banking failed.')

        # Remember the links needed by logout() and postbox_items()
        self.logout_url = soup.find('a', id='logout')['href']
        self.postbox_url = soup.find(id="valueOut")['href']

        if self.verbose:
            print('Logged in to DKB Online Banking')
        return True

    def logout(self):
        """
        Logout from DKB Online Banking

        The underlying HTTP session is closed even if the logout request
        itself fails.

        Returns
        -------
        bool
            ``True`` if the logout request was answered with HTTP 200.

        Raises
        ------
        RuntimeError
            If no logout link is known, i.e. login() has not succeeded.
        """
        if self.logout_url is None:
            raise RuntimeError('Not logged in to DKB Online Banking.')

        if self.verbose:
            print('Log out from DKB Online Banking')

        try:
            r = self.s.get(self.base_url + self.logout_url)
            return r.status_code == 200
        finally:
            self.s.close()

    def postbox_items(self):
        """
        Iterate over the postbox items

        Yields
        ------
        PostboxDocument
            One record per row of the postbox document table.

        Raises
        ------
        RuntimeError
            If no postbox link is known, i.e. login() has not succeeded.
            Raised when iteration starts, as this is a generator.
        """
        if self.postbox_url is None:
            raise RuntimeError('Not logged in to DKB Online Banking.')

        # Get Postbox page
        r = self.s.get(self.base_url + self.postbox_url)

        # Parse Postbox page (see login() for the explicit parser)
        soup = bs4.BeautifulSoup(r.text, 'lxml')

        # Get document table
        table = soup.find(id="documentsTableOverview_outer")

        # Iterate through the documents
        for row in table.table.tbody.find_all('tr'):
            title_cell = row.find(id='title')

            # document read? (the title is wrapped in <strong> otherwise)
            is_read = title_cell.find('strong') is None

            # document title
            title = title_cell.a.text

            # document download link
            url = row.find(title='Speichern').find_parent().get('href')

            # filename: last path segment of the link without query string
            filename = url.split('/')[-1].split('?')[0] + '.pdf'

            yield PostboxDocument(
                is_read=is_read,
                title=title,
                url=url,
                filename=filename,
            )

    def download_document(self, document, destinations):
        """
        Download a document

        The document is fetched once into a temporary file which is then
        copied to every destination. Destinations that already exist are
        reported and skipped. The temporary file is removed and the HTTP
        response is closed in all cases, including errors.

        Parameters
        ----------
        document : PostboxDocument
            Document to fetch; its ``url`` and ``title`` are used.
        destinations : iterable of str
            File paths the downloaded document is copied to.

        Returns
        -------
        bool
            Always ``True``; failures are reported by exception.

        Raises
        ------
        RuntimeError
            If the download request is not answered with HTTP 200.
        """
        url = self.base_url + document.url

        # download document
        if self.verbose:
            print("Download document '{}'".format(document.title))

        tmp_filename = None
        try:
            r = self.s.get(url, stream=True)
            try:
                if r.status_code != 200:
                    raise RuntimeError('Download failed.')
                # let urllib3 undo gzip/deflate transfer encoding
                r.raw.decode_content = True

                # copy http data to temporary file
                with tempfile.NamedTemporaryFile(delete=False) as fp:
                    tmp_filename = fp.name
                    shutil.copyfileobj(r.raw, fp)
            finally:
                # streamed responses keep the connection until closed
                r.close()

            # copy file to destinations
            for dest in destinations:
                if self.verbose:
                    print("Copy to {}".format(dest))
                if os.path.exists(dest):
                    print('"{}" already exists'.format(dest))
                    continue
                shutil.copyfile(tmp_filename, dest)
        finally:
            # delete temporary file, also when a copy above failed
            if tmp_filename is not None:
                os.unlink(tmp_filename)

        return True