-
Notifications
You must be signed in to change notification settings - Fork 0
/
alt_models_summarize.py
193 lines (167 loc) · 9.83 KB
/
alt_models_summarize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!/usr/bin/env python3
# Change log
# * v1.0.1 2024-04-12: Fixed issue with output writing; removed trailing spaces and replaced tabs with spaces. Added phenotype column to output files.
# * v1.0.0 2024-04-12: Initial version.
# Version information
# Program name, version and release date shown in the banner and by --version.
VERSION_NAME = 'Summarize AltModels'
VERSION = '1.0.1'
VERSION_DATE = '2024-04-12'
# Copyright holder line printed in the help epilog and the closing banner.
COPYRIGHT = (
    'Copyright 1979-2024. Sander W. van der Laan | '
    's.w.vanderlaan [at] gmail [dot] com | https://vanderlaanand.science.'
)
# Full MIT license text; every line break and trailing space is part of the printed output.
COPYRIGHT_TEXT = (
    '\nThe MIT License (MIT). \n\n'
    'Permission is hereby granted, free of charge, to any person obtaining a copy of this software and \n'
    'associated documentation files (the "Software"), to deal in the Software without restriction, \n'
    'including without limitation the rights to use, copy, modify, merge, publish, distribute, \n'
    'sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is \n'
    'furnished to do so, subject to the following conditions: \n\n'
    'The above copyright notice and this permission notice shall be included in all copies \n'
    'or substantial portions of the Software. \n\n'
    'THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, \n'
    'INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR \n'
    'PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS \n'
    'BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, \n'
    'TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE \n'
    'OR OTHER DEALINGS IN THE SOFTWARE. \n\n'
    'Reference: http://opensource.org.'
)
# Import modules
import argparse
from argparse import RawTextHelpFormatter
import os
import time
from datetime import datetime
from datetime import timedelta
import shutil
import tarfile
# Define default values
# Default for --output_dir: the working directory at the moment this module is loaded.
default_output_dir = os.getcwd()
# phenotypes_logistic = ["Calcification", "Collagen", "Fat10", "Fat40", "IPH"]
# phenotypes_linear = ["SMC_rankNorm", "MAC_rankNorm", "Neutrophils_rankNorm", "MastCells_rankNorm", "VesselDensity_rankNorm", "Plaque_Vulnerability_Index"]
# Function to concatenate logistic regression results
def concatenate_logistic_results(snp_format, phenotypes, output_dir, verbose):
    """Merge the per-phenotype logistic regression results into one summary file.

    For each phenotype the file
    ``logistic_chr<snp_format>_<phenotype>.<phenotype>.glm.logistic.hybrid`` is read
    relative to the current working directory. Its first line (the header) is
    dropped, tabs are replaced by single spaces, and every remaining non-blank
    row is written, prefixed with the phenotype name, to
    ``<output_dir>/logistic.chr<snp_format>.results.txt``. Any existing summary
    file with that name is overwritten.

    Args:
        snp_format: SNP identifier exactly as it appears in the file names
            (e.g. ``16-12345``).
        phenotypes: Iterable of phenotype names; one input file is read per name.
        output_dir: Directory in which the summary file is created.
        verbose: When true, progress messages are printed.

    Raises:
        OSError: If the summary file cannot be created or an input file cannot
            be opened (e.g. ``FileNotFoundError``).
    """
    output_file = os.path.join(output_dir, f"logistic.chr{snp_format}.results.txt")
    if verbose:
        print(f"- creating summary results file [{output_file}], and adding results for...")
    with open(output_file, "w") as summary:
        summary.write("Phenotype CHROM POS ID REF ALT PROVISIONAL_REF? A1 OMITTED A1_FREQ Firth? TEST OBS_CT OR LOG(OR)_SE Z_STAT P ERRCODE\n")
        for p in phenotypes:
            if verbose:
                print(f"- {p}")
            file_name = f"logistic_chr{snp_format}_{p}.{p}.glm.logistic.hybrid"
            with open(file_name) as infile:
                # Skip the header; the default keeps an empty result file from
                # raising StopIteration.
                next(infile, None)
                for line in infile:
                    # Replace tabs with spaces
                    row = line.strip().replace("\t", " ")
                    if not row:
                        # A blank line would otherwise yield a row holding only
                        # the phenotype followed by a trailing space.
                        continue
                    summary.write(f"{p} {row}\n")
# Function to concatenate linear regression results
def concatenate_linear_results(snp_format, phenotypes, output_dir, verbose):
    """Merge the per-phenotype linear regression results into one summary file.

    For each phenotype the file
    ``linear_chr<snp_format>_<phenotype>.<phenotype>.glm.linear`` is read relative
    to the current working directory. Its first line (the header) is dropped,
    tabs are replaced by single spaces, and every remaining non-blank row is
    written, prefixed with the phenotype name, to
    ``<output_dir>/linear.chr<snp_format>.results.txt``. Any existing summary
    file with that name is overwritten.

    Args:
        snp_format: SNP identifier exactly as it appears in the file names
            (e.g. ``16-12345``).
        phenotypes: Iterable of phenotype names; one input file is read per name.
        output_dir: Directory in which the summary file is created.
        verbose: When true, progress messages are printed.

    Raises:
        OSError: If the summary file cannot be created or an input file cannot
            be opened (e.g. ``FileNotFoundError``).
    """
    output_file = os.path.join(output_dir, f"linear.chr{snp_format}.results.txt")
    if verbose:
        print(f"- creating summary results file [{output_file}], and adding results for...")
    with open(output_file, "w") as summary:
        summary.write("Phenotype CHROM POS ID REF ALT PROVISIONAL_REF? A1 OMITTED A1_FREQ TEST OBS_CT BETA SE T_STAT P ERRCODE\n")
        for p in phenotypes:
            if verbose:
                print(f"- {p}")
            file_name = f"linear_chr{snp_format}_{p}.{p}.glm.linear"
            with open(file_name) as infile:
                # Skip the header; the default keeps an empty result file from
                # raising StopIteration.
                next(infile, None)
                for line in infile:
                    # Replace tabs with spaces
                    row = line.strip().replace("\t", " ")
                    if not row:
                        # A blank line would otherwise yield a row holding only
                        # the phenotype followed by a trailing space.
                        continue
                    summary.write(f"{p} {row}\n")
# Function to create tar.gz files
def create_tar_file(file_name, raw_result, raw_result_log, output_dir, verbose=False):
    """Bundle a raw result file and its log into a gzipped tarball, then delete both.

    The archive is written to ``<output_dir>/<file_name>.tar.gz``. The two
    inputs are added in the order result, log, and are removed from disk only
    after the archive has been closed.

    Args:
        file_name: Base name of the archive (without the ``.tar.gz`` suffix).
        raw_result: Path of the raw regression result file.
        raw_result_log: Path of the matching log file.
        output_dir: Directory in which the archive is created.
        verbose: When true, progress messages are printed.
    """
    inputs = (raw_result, raw_result_log)
    if verbose:
        print(f"- processing {raw_result} and {raw_result_log}")

    archive_path = os.path.join(output_dir, f"{file_name}.tar.gz")
    with tarfile.open(archive_path, "w:gz") as archive:
        for member in inputs:
            archive.add(member)

    # The originals are only deleted once the archive has been written.
    if verbose:
        print("- archived raw results; removing input files")
    for member in inputs:
        os.remove(member)
# Define main function
def main(argv=None):
    """Parse the command line, write the summary file and archive the raw results.

    The selected model (``--glm``) decides which concatenation function is used
    and which raw result extension is archived. Raw result and log files are
    addressed by relative names, i.e. they are looked up in the current
    working directory.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            lets argparse read them from ``sys.argv``.

    Raises:
        SystemExit: With status 1 when ``--snp``, ``--glm`` or ``--phenotypes``
            is missing. argparse itself also exits for invalid options,
            ``--help`` and ``--version``.
    """
    description = (
        f"\n+ {VERSION_NAME} v{VERSION} +\n"
        "This script will concatenate the results of the logistic or linear regression models (`--glm`)\n"
        "for a given SNP (`--snp`) and a list of phenotypes (`--phenotypes`). The results will be\n"
        "written to a single file in the output directory (`--output_dir`); by default the current\n"
        "working directory. The script will also create `.tar.gz` files for the raw results of the\n"
        "logistic or linear regression models. The raw results are the files with the extension\n"
        "`.glm.logistic.hybrid` or `.glm.linear` and the log files with the extension `.log`\n"
        "Use the `--verbose` flag to print extra information.\n"
        "To get more information on the script, please use the `--help` flag. The `--version` flag will provide\n"
        "you with the version information.\n"
        "Example usage:\n"
        # --glm takes a value; the example has to name the model.
        "python alt_models_summarize.py --snp 16:12345 --glm logistic --phenotypes Calcification Collagen Fat10 Fat40 IPH --output_dir results\n"
    )
    epilog = f"\n+ {VERSION_NAME} v{VERSION}. {COPYRIGHT} \n{COPYRIGHT_TEXT}+"

    parser = argparse.ArgumentParser(
        description=description,
        epilog=epilog,
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument("--snp", "-s", help="Specify the SNP in the format 16:12345")
    parser.add_argument("--glm", "-g", choices=["logistic", "linear"], help="Regression model whose results should be summarized: 'logistic' or 'linear'.")
    parser.add_argument("--phenotypes", "-p", nargs="+", help="List of phenotypes. For example: 'phenotype1 phenotype2 phenotype3'.")
    parser.add_argument("--output_dir", "-o", default=default_output_dir, help="Output directory for results")
    parser.add_argument('--verbose', '-v', action='store_true', help='Print extra information. Optional.')
    parser.add_argument('--version', '-V', action='version', version=f'%(prog)s {VERSION} ({VERSION_DATE}).')
    args = parser.parse_args(argv)

    # Start the script
    print(f"+ {VERSION_NAME} v{VERSION} ({VERSION_DATE}) +")
    print(f"\nSummarizing alternative models on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}.")

    # The required arguments are checked by hand so that the banner above is
    # printed first and the full help text is shown on failure.
    missing_args = [name for name in ("snp", "glm", "phenotypes") if not getattr(args, name)]
    if missing_args:
        print(f"ERROR. The following required arguments are missing, but are required: {', '.join(missing_args)}")
        parser.print_help()
        raise SystemExit(1)

    # Start the timer; a monotonic clock is not affected by system clock changes.
    start_time = time.perf_counter()
    today_date = datetime.now()

    # Convert the SNP from 16:12345 to 16-12345, the form used in the file names.
    snp_format = args.snp.replace(":", "-")

    # Create the output directory when needed.
    if not os.path.exists(args.output_dir):
        print(f"Output directory '{args.output_dir}' does not exist. Making it for you.")
        os.makedirs(args.output_dir, exist_ok=True)
    if args.verbose:
        print(f"Output directory is '{args.output_dir}'.")

    # Per model: the concatenation function and the raw result file extension.
    # argparse `choices` guarantees that args.glm is one of these keys.
    summarizers = {
        "logistic": (concatenate_logistic_results, "glm.logistic.hybrid"),
        "linear": (concatenate_linear_results, "glm.linear"),
    }
    concatenate, result_suffix = summarizers[args.glm]

    if args.verbose:
        print(f"Summarizing {args.glm} regression results for SNP {snp_format} and phenotypes {args.phenotypes}.")
    concatenate(snp_format, args.phenotypes, args.output_dir, args.verbose)

    if args.verbose:
        print(f"Creating tar.gz files for {args.glm} regression results.")
    for p in args.phenotypes:
        base_name = f"{args.glm}_chr{snp_format}_{p}"
        if args.verbose:
            # Report the archive name that is actually created.
            print(f"Adding raw results to {base_name}.tar.gz")
        create_tar_file(base_name, f"{base_name}.{p}.{result_suffix}", f"{base_name}.log", args.output_dir, args.verbose)

    # Break the elapsed time down into hours, minutes, seconds and milliseconds.
    time_delta = timedelta(seconds=time.perf_counter() - start_time)
    # timedelta.seconds excludes whole days, so fold the days back in.
    whole_seconds = time_delta.days * 86400 + time_delta.seconds
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    # Floor division keeps the value in 0-999; rounding could report 1000.
    milliseconds = time_delta.microseconds // 1000
    formatted_time = f"{hours} hours, {minutes} minutes, {seconds} seconds, {milliseconds} milliseconds"

    # Print the total execution time
    print(f"\nScript executed on {today_date.strftime('%Y-%m-%d')}. Total execution time was {formatted_time} (minus writing time).")
# Run the main function
if __name__ == "__main__":
    main()
    # Print the version number and license text after a completed run. Keeping
    # this under the guard means importing the module prints nothing.
    print(f"\n+ {VERSION_NAME} v{VERSION} ({VERSION_DATE}). {COPYRIGHT} +")
    print(COPYRIGHT_TEXT)
# End of file