-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01_run_download.py
54 lines (38 loc) · 1.48 KB
/
01_run_download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
'''
The purpose of this file is to download the zipped files containing
the PubMed publications (i.e. documents), verify the downloaded files
against their MD5 checksums, and extract them. The extracted documents
will be mined later.
'''
import os, json, sys
from caseolap._01_download import *
'''
Parameters
'''
# Input: root folder under which the PubMed files are stored
data_dir = './'

# Configuration files read by the main section
download_config_file_path = './config/download_config.json'
ftp_config_file_path = './config/ftp_config.json'

# Log file handed to the download / verification / extraction helpers
logFilePath = './log/download_log.txt'

# Folders (below data_dir) holding the 'baseline' and 'update' files
baseline_dir, update_files_dir = (
    os.path.join(data_dir, relative_path)
    for relative_path in (
        'ftp.ncbi.nlm.nih.gov/pubmed/baseline/',
        'ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/',
    )
)
'''
Main Code
'''
# Start the download, verification, and extraction process
if __name__ == '__main__':
    # Make sure the folder for the log file exists; opening a file in 'w'
    # mode does not create missing parent directories.
    log_dir = os.path.dirname(logFilePath)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # Open log file; the `with` block guarantees it is flushed and closed
    # even if one of the steps below raises.
    with open(logFilePath, 'w') as logfile:
        # Load config files (each handle is closed right after parsing)
        with open(download_config_file_path) as config_file:
            download_config = json.load(config_file)
        with open(ftp_config_file_path) as config_file:
            ftp_config = json.load(config_file)

        # Check main directory. This only prints a warning and the run
        # continues, as in the original script.
        # NOTE(review): assumes only the print belonged to this `if` --
        # confirm against the upstream file.
        if not os.path.isdir(data_dir):
            print('Directory not found:', data_dir)

        # Start download
        download_pubmed(data_dir, download_config, ftp_config, logfile)

        # Verify download: 'baseline files' & 'update files'
        check_all_md5_in_dir(baseline_dir, logfile, linux=True, mac=False)
        check_all_md5_in_dir(update_files_dir, logfile, linux=True, mac=False)

        # Extract downloaded files: 'baseline files' & 'update files'
        extract_all_gz_in_dir(baseline_dir, logfile)
        extract_all_gz_in_dir(update_files_dir, logfile)