-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdown-from-dllfiles.py
59 lines (46 loc) · 1.8 KB
/
down-from-dllfiles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from lxml import html
import requests
import re
from six.moves import urllib
def maybe_download(source, work_directory):
    """Download ``source`` into ``work_directory`` and return the saved path.

    The local file name is the last segment of the URL path; a query string
    or fragment, if present, is ignored when deriving it.

    Args:
        source: URL of the file to fetch.
        work_directory: Directory in which the file is stored.

    Returns:
        The path of the downloaded file, as returned by ``urlretrieve``.

    Raises:
        ValueError: If no file name can be derived from ``source``.
    """
    import os

    # Take the name from the URL path so that links without a "?query" part
    # work too; the previous regex required a literal "?" and crashed with
    # AttributeError on None when it was absent.
    url_path = urllib.parse.urlsplit(source).path
    filename = url_path.rstrip('/').rsplit('/', 1)[-1]
    if not filename:
        raise ValueError('Cannot derive a file name from URL: %s' % source)

    print('Downloading file %s' % filename)
    filepath = os.path.join(work_directory, filename)
    filepath, _ = urllib.request.urlretrieve(source, filepath)
    return filepath
def get_link(source):
    """Return the download link found on the page at ``source``.

    The page is fetched and parsed, then the first
    ``<div class="file_download">`` element is inspected and the ``href`` of
    its first direct ``<a>`` child is returned.

    Args:
        source: URL of the page to inspect.

    Returns:
        The ``href`` value, or ``u''`` when the page has no such div or the
        div contains no anchor with an ``href``.
    """
    page = requests.get(source)
    tree = html.fromstring(page.content)

    containers = tree.xpath('//div[@class="file_download"]')
    if not containers:
        return u''

    # A download div without an <a href> used to raise IndexError; treat it
    # the same as "no link on this page".
    hrefs = containers[0].xpath('a/@href')
    return hrefs[0] if hrefs else u''
def get_dll_links_from_alink(source, outfile):
    """Append the links listed on one index page to ``outfile``.

    The page at ``source`` is fetched and parsed. For every ``<li>`` under
    ``<div class="content"><ul>``, the ``href`` of its first direct ``<a>``
    child is written to ``outfile`` on its own line.

    Args:
        source: URL of the index page.
        outfile: Path of the text file to append to (opened in ``'a'`` mode).
    """
    page = requests.get(source)
    tree = html.fromstring(page.content)

    # ``with`` guarantees the file is closed even if writing or an XPath
    # lookup raises part-way through.
    with open(outfile, 'a') as f:
        for sel in tree.xpath('//div[@class="content"]/ul/li'):
            hrefs = sel.xpath('a/@href')
            if not hrefs:
                # List item without a link: nothing to record.
                continue
            f.write('%s\n' % hrefs[0])
def get_dll_links_from_dllfiles(outfile):
    """Collect links from every dll-files.com index page into ``outfile``.

    Visits the "0-9" index page followed by the index pages for the letters
    ``a`` to ``z`` and passes each to ``get_dll_links_from_alink``, which
    appends the links it finds to ``outfile``.

    Args:
        outfile: Path of the text file that receives the links.
    """
    import string

    # NOTE: the ``outfile`` argument is honoured; it was previously
    # overwritten by a hard-coded path inside this function.
    source = u'http://www.dll-files.com/dllindex/index-0-9.shtml'
    get_dll_links_from_alink(source, outfile)

    # ``ascii_lowercase`` exists on both Python 2 and 3 and is not
    # locale-dependent, unlike ``string.lowercase``.
    for c in string.ascii_lowercase:
        url = "http://www.dll-files.com/dllindex/index-%c.shtml" % c
        get_dll_links_from_alink(url, outfile)
def main():
    """Download every DLL whose page path is listed in the link file.

    Each non-blank line of the link file is appended to
    ``http://www.dll-files.com`` to form a page URL. ``get_link`` resolves
    that page to a download URL, and ``maybe_download`` stores the file in
    the download directory. Pages without a download link are skipped.
    """
    filepath = '/home/dien/workspace/virus-detection/data/dll-link.txt'
    download_dir = '/home/dien/workspace/virus-detection/data/dll-files'

    # Trailing comma kept from the original: under Python 2 it keeps
    # "Done!" on the same line; under Python 3 it is a harmless no-op.
    print ('Getting links from website: dll-files.com...'),
    # The harvesting step is currently disabled, so the link file must
    # already exist. Re-enable the call below to rebuild it.
    # get_dll_links_from_dllfiles(filepath)
    print ('Done!')

    with open(filepath, 'r') as f:
        for post_url in f:
            post_url = post_url.strip()
            if not post_url:
                # A blank line would otherwise request the bare site root.
                continue
            link = u'http://www.dll-files.com%s' % post_url
            download_url = get_link(link)
            if download_url:
                maybe_download(download_url, download_dir)


if __name__ == '__main__':
    main()