Skip to content

Commit

Permalink
Added additional pre-computed maps with relaxed intra- and inter- blo…
Browse files Browse the repository at this point in the history
…ck correlation thresholds. Bug fix for log file creation. Bug fixes for argument error parsing and checking. Added parameters for intermediate file storage directory and output file directory.
  • Loading branch information
naumanjaved committed Aug 30, 2020
1 parent 83436f8 commit 852db24
Show file tree
Hide file tree
Showing 16 changed files with 960,203 additions and 198 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@ git clone https://github.com/naumanjaved/fingerprint_maps.git
```

## Precomputed map files

* [hg19 with "chr" prefix](https://github.com/naumanjaved/fingerprint_maps/blob/master/map_files/hg19_chr.map)
* [hg19 without "chr" prefix](https://github.com/naumanjaved/fingerprint_maps/blob/master/map_files/hg19_nochr.map)
* [hg38 with "chr" prefix](https://github.com/naumanjaved/fingerprint_maps/blob/master/map_files/hg38_chr.map)
* [hg38 without "chr" prefix](https://github.com/naumanjaved/fingerprint_maps/blob/master/map_files/hg38_nochr.map)

The `map_files` directory also contains pre-computed maps built with relaxed intra- and inter-block correlation thresholds. Each map's file name includes the parameters used to build it.

## Dependencies

Expand All @@ -31,12 +33,12 @@ In order to run `build_fingerprint_maps`, you must have working installations of
g. `traceback`
h. `time`
i. `datetime`

4. [LDSC (LD Score regression)](https://github.com/bulik/ldsc)

## Required Files
Fingerprint maps uses VCFs from 1000 Genomes Phase 3 and recombination maps (SHAPEIT format). These can be found here:
* [hg19 VCFs](ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/)
* [hg38 VCFs(liftover)](ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV/)
* [hg19 and hg38 liftover VCFs](https://www.internationalgenome.org/data/)
* [hg19 recombination maps from SHAPEIT](references/genetic_map_b37.tar.gz)
* [hg38 recombination maps from SHAPEIT liftover](references/genetic_map_hg38.tar.gz)

Expand Down
218 changes: 125 additions & 93 deletions build_fingerprint_maps.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
Created on Tue Nov 21 15:12:16 2017
@author: Nauman Javed
"""
from __future__ import division
import map_building_functions as build
Expand All @@ -15,7 +15,7 @@
import time
import datetime

__version__ = '1.0.17'
__version__ = '1.0.1'
MASTHEAD = "---------------------------------------------------------------\n"
MASTHEAD += "* Fingerprint_Maps\n"
MASTHEAD += "* Version {V}\n".format(V=__version__)
Expand All @@ -24,7 +24,6 @@
class Logger(object):
'''
Lightweight logging.
TODO: replace with logging module
'''
def __init__(self, fh):
self.log_fh = open(fh, 'wb')
Expand All @@ -37,72 +36,79 @@ def log(self, msg):
print msg

def create_maps(args, log):
intermediate_directory = args.cwd + "intermediates/"
log.log('Pre-processing VCFs.')
build.filter_VCFs(args.chromosome, args.VCF_file, intermediate_directory, args.min_MAF)



log.log('Pre-processing VCFs.')
build.filter_VCFs(args.chrom, args.VCF_file, int_dir, args.min_MAF)
log.log('Finished processing VCFs.')



log.log('Creating list of SNPs with similar MAFs across populations...')
build.extract_similar_SNPs(args.chromosome,
intermediate_directory, args.similarity)
log.log('Finished creating lists of similar SNPs...')
build.extract_similar_SNPs(args.chrom,
int_dir, args.similarity, args.min_MAF)
log.log('Finished creating lists of similar SNPs...')

log.log('Creating VCFs with common variants')
build.keep_common_SNPs(args.chromosome, intermediate_directory)
build.keep_common_SNPs(args.chrom, int_dir)
log.log('Finished creating VCFs with common variants')

log.log('Creating PLINK binary files...')
build.create_PLINK_binary(args.chromosome, intermediate_directory,
args.recomb_directory)
build.create_PLINK_binary(str(args.chrom), int_dir,
args.recomb_file)
log.log('Finished creating binary files.')

log.log('Calculating LDScores...')
build.LD_score(args.chromosome, intermediate_directory, args.LD_script,
args.LDScore_window_autosome)
build.LD_score(str(args.chrom), int_dir, args.LD_script,
args.LDScore_window_autosome)
log.log('Finished calculating LDScores.')

log.log('Creating PLINK association files...')
build.order(args.chromosome, intermediate_directory)
build.order(args.chrom, int_dir)
log.log('Finished creating PLINK association files.')

log.log('Pruning SNPs...')
build.prune(args.chromosome, intermediate_directory,
args.prune_window, args.prune_slide, args.prune_cutoff)
build.prune(args.chrom, int_dir,
int(args.prune_window), args.prune_slide, args.prune_cutoff)
log.log('Finished pruning SNPs.')

log.log('Separating out LDScores into dependent and independent SNP files...')
build.LD_separate(args.chromosome, intermediate_directory)
build.LD_separate(args.chrom, int_dir)
log.log('Finished separating LDScores.')

log.log('Clumping SNPs...')
build.clump(args.chromosome, intermediate_directory,
build.clump(args.chrom, int_dir,
args.clump_cutoff, args.max_distance_clump)
log.log('Finished clumping SNPs.')

log.log('Building map file...')
build.reformat_clumps(args.chromosome, intermediate_directory)
build.reformat_clumps(args.chrom, int_dir)
log.log('Finished building map files.')

log.log('Detecting negative LD...')
build.detect_negative_LD(args.chromosome, intermediate_directory)
build.detect_negative_LD(args.chrom, int_dir)
log.log('Finished recording negative LD.')

log.log('Switching alleles...')
build.switch_alleles(args.chromosome, args.cwd, intermediate_directory)
build.switch_alleles(args.chrom, int_dir, out_dir)
log.log('Finished switching negative LD alleles.')



# Argument parsing
parser = argparse.ArgumentParser()
# Directory specifications'
parser.add_argument('--recomb_directory', default=None, type=str,
help='Directory pointing to where shapeit recombination files are located.')
parser.add_argument('--chromosome', default=None, type=str,

# Directory and reference file specifications
parser.add_argument('--int_dir', default=None, type=str,
help='directory to store intermediate files')
parser.add_argument('--out_dir', default=None, type=str,
help='directory to store output files')
parser.add_argument('--recomb_file', default=None, type=str,
help='Shapeit recombination file')
parser.add_argument('--chrom', default=None, type=str,
help='Chromosome for which to calculate.')
parser.add_argument('--VCF_file', default=None, type=str,
help='VCF file name for the selected chromosome')
parser.add_argument('--LD_script', default=None, type=str,
help='Directory containing ldsc.py ')
parser.add_argument('--cwd', type=str, help='current working directory')

# SNP filtering and map calculation parameters
parser.add_argument('--similarity', default=0.10, type=float,
Expand All @@ -115,11 +121,11 @@ def create_maps(args, log):
help='Window size in kb for PLINK prune function')
parser.add_argument('--prune_slide', default=5, type=int,
help='Number of SNPs to slide over on each iteration of PLINK prune')
parser.add_argument('--prune_cutoff', default=0.1, type=float,
parser.add_argument('--prune_cutoff', default=0.10, type=float,
help='Maximum r^2 correlation allowed between pruned SNPs')
parser.add_argument('--clump_cutoff', default=0.9, type=float,
parser.add_argument('--clump_cutoff', default=0.85, type=float,
help='Minimum r^2 correlation required for SNPs to be clumped together')
parser.add_argument('--max_distance_clump', default=10000, type=float,
parser.add_argument('--max_distance_clump', default=10000, type=int,
help='Maximum distance in kb a SNP can be from index SNP when forming clump')

if __name__ == "__main__":
Expand All @@ -128,75 +134,101 @@ def create_maps(args, log):
'9', '10', '11', '12', '13', '14', '15', '16',
'17', '18', '19', '20', '21', '22', 'X']

if not isinstance(args.recomb_directory, str):
raise TypeError('--recomb_directory must be string pointing to where recomb files are stored')
if args.recomb_directory is None:
raise ValueError('--recomb_directory is required.')
# argument error handling

if args.int_dir is None:
raise ValueError('--int_dir is required.')

if args.out_dir is None:
raise ValueError('--out_dir is required.')

if args.recomb_file is None:
raise ValueError('--recomb_file is required.')
if not os.path.isfile(args.recomb_file):
raise ValueError('--recomb_file not found.')

if not isinstance(args.chromosome, str):
raise TypeError('--chromosome must be a string 1-22 or X')
if args.chromosome is None or args.chromosome not in accepted_chromosomes:
raise ValueError('--chromosome is required.')
if args.chrom is None :
raise ValueError('--chrom is required.')
if args.chrom not in accepted_chromosomes:
raise ValueError('--chrom must be between 1-22 or X.')

if not isinstance(args.VCF_file, str):
raise TypeError('--VCF_file must be full path pointing to VCF file')
if args.VCF_file is None:
raise ValueError('--VCF_file_name is required.')
raise ValueError('--VCF_file is required.')
if not os.path.isfile(args.VCF_file):
raise ValueError('--VCF_file not found.')

if not isinstance(args.LD_script, str):
raise TypeError('--LD_script must be full path pointing to ldsc.py')
if args.LD_script is None:
raise ValueError('--LD_script is required.')

if not isinstance(args.similarity, float):
raise TypeError('--similarity must be a float between 0.0 and 1.0')
if args.similarity is None or args.similarity < 0.0 or args.similarity > 1.0:
raise ValueError('--similarity is required - must be float between 0.0 and 1.0')

if not isinstance(args.min_MAF, float):
raise TypeError('--min_MAF must be a float between 0.0 and 1.0')
if args.min_MAF is None or args.min_MAF < 0.0 or args.min_MAF> 1.0:
raise ValueError('--min_MAF is required - must be float between 0.0 and 1.0')

if not isinstance(args.LDScore_window_autosome, float):
raise TypeError('--LDScore_window_autosome must be a float > 0.0')
if args.LDScore_window_autosome is None or args.LDScore_window_autosome < 0.0:
raise ValueError('--LDScore_window_autosome is required - must be float > 0.0')
if not isinstance(args.prune_window, int):
raise TypeError('--prune_window must be an int > 0')
if args.prune_window is None or args.prune_window <=0:
if not os.path.isfile(args.LD_script):
raise ValueError('--LD_script not found.')

if args.similarity is None:
raise ValueError('--similarity is required.')

if (args.similarity < 0.0) or (args.similarity > 1.0):
raise ValueError('--similarity must be float > 0.0 and < 1.0')

if args.min_MAF is None:
raise ValueError('--min_MAF is required.')

if (args.min_MAF < 0.0) or (args.min_MAF > 1.0):
raise ValueError('--min_MAF must be float > 0.0 and < 1.0')

if args.LDScore_window_autosome is None:
raise ValueError('--LDScore_window_autosome is required.')
if args.LDScore_window_autosome < 0.0:
raise ValueError('--LDScore_window_autosome must be float > 0.0')

if args.prune_window is None:
raise ValueError('--prune_window is required.')
if args.prune_window <= 0:
raise TypeError('--prune_window must be an integer > 0')

if not isinstance(args.prune_slide, int):
raise TypeError('--prune_slide must be an int > 0')
if args.prune_slide is None or args.prune_slide <=0:
if args.prune_slide is None:
raise ValueError('--prune_slide is required.')
if args.prune_slide <= 0:
raise ValueError('--prune_slide must be an int > 0')

if args.prune_cutoff is None:
raise ValueError('--prune_cutoff is required')
if (args.prune_cutoff < 0.0) or (args.prune_cutoff > 1.0):
raise TypeError('--prune_cutoff must be a float > 0.0 and < 1.0')

if args.clump_cutoff is None:
raise ValueError('--clump_cutoff is required.')
if (args.clump_cutoff < 0.0) or (args.clump_cutoff > 1.0):
raise ValueError('--clump_cutoff must be a float between 0.0 and 1.0')

if args.max_distance_clump is None:
raise ValueError('--max_distance_clump is required.')
if args.max_distance_clump <= 0:
raise TypeError('--max_distance_clump must be an int > 0')

if not isinstance(args.prune_cutoff, float):
raise TypeError('--prune_cutoff must be a float between 0.0 and 1.0')
if args.prune_cutoff is None or args.prune_cutoff < 0.0 or args.prune_cutoff > 1.0:
raise ValueError('--prune_cutoff is required - float between 0.0 and 1.0')
int_dir = args.int_dir

if not isinstance(args.clump_cutoff, float):
raise TypeError('--clump_cutoff must be a float between 0.0 and 1.0')
if args.clump_cutoff is None or args.clump_cutoff < 0.0 or args.clump_cutoff > 1.0:
raise ValueError('--clump_cutoff is required - float between 0.0 and 1.0')
if not os.path.exists(int_dir):
os.makedirs(int_dir)

if int_dir[-1] != '/':
int_dir += '/'

out_dir = args.out_dir

if not os.path.exists(out_dir):
os.makedirs(out_dir)

if out_dir[-1] != '/':
out_dir += '/'

if not isinstance(args.max_distance_clump, int):
raise TypeError('--max_distance_clump must be an int > 0')
if args.max_distance_clump is None or args.max_distance_clump <=0:
raise ValueError('--max_distance_clump is required - must be int > 0')


log = Logger(args.cwd + "output/" + args.chromosome+'.log')
log = Logger(os.path.join(args.out_dir, args.chrom + '.log'))

try:
defaults = vars(parser.parse_args(''))
opts = vars(args)
non_defaults = [x for x in opts.keys() if opts[x] != defaults[x]]
header = MASTHEAD
header += 'build_fingerprint_maps.py \\\n'
options = ['--'+x.replace('_','-')+' '+str(opts[x])+' \\' for x in non_defaults]
options = ['--'+x.replace('_','-')+' '+str(opts[x])+' \\' for x in opts.keys()]
header += '\n'.join(options).replace('True','').replace('False','')
header = header[0:-1]+'\n'
log.log(header)
Expand Down
Loading

0 comments on commit 852db24

Please sign in to comment.