-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathconversion_script.py
106 lines (94 loc) · 3.39 KB
/
conversion_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# encoding: utf-8
"""
Author: Anna Lewis
(C) 2014
"""
import sys
import time
import gzip
import os.path
# Current local date in the locale's date representation ("%x"), computed once
# when the module is loaded; main() formats it into the "##fileDate" header line.
date = time.strftime("%x")
def get_alt_and_genotype(ref, alleles):
    """Work out the ALT value and genotype code for a single call.

    ref     -- reference base at the position.
    alleles -- called allele string; only its first one or two
               characters are examined.

    Alleles are compared with the reference case-insensitively.

    Returns a tuple (alt, genotype):
      * alt is "." when every called allele equals the reference,
        otherwise the non-reference allele(s) exactly as given, joined
        with a comma when two different non-reference alleles are seen.
      * genotype is the integer 0 or 1 for a one-character call, and one
        of the strings "0/0", "0/1", "1/1" or "1/2" otherwise.
    """
    first = alleles[0]
    ref_key = ref.lower()
    first_is_ref = first.lower() == ref_key

    # One-character call: the genotype is a bare integer, not "x/y".
    if len(alleles) == 1:
        return (".", 0) if first_is_ref else (first, 1)

    second = alleles[1]
    second_is_ref = second.lower() == ref_key

    if first_is_ref and second_is_ref:
        return ".", "0/0"
    if first_is_ref:
        return second, "0/1"
    if second_is_ref:
        return first, "0/1"
    if first.lower() == second.lower():
        # Both alleles differ from the reference but agree with each other.
        return first, "1/1"
    # Two distinct non-reference alleles.
    return "{0},{1}".format(first, second), "1/2"
def _load_reference(ref_path):
    """Read the gzipped reference table at ref_path into a dict.

    Every line must hold three whitespace-separated fields: chromosome,
    position and base. The returned dict maps "<chromosome>.<position>"
    to the base.
    """
    ref_dict = {}
    # Text mode is required on Python 3: with plain 'r' gzip yields bytes, and
    # formatting bytes into the key gives "b'chr1'.b'123'", which can never
    # match the str keys built from the 23andMe input.
    with gzip.open(ref_path, 'rt') as ref_file:
        for line in ref_file:
            ref_chr, ref_pos, ref_base = line.strip().split()
            ref_dict["{0}.{1}".format(ref_chr, ref_pos)] = ref_base
    return ref_dict


def main(argv):
    """Convert a 23andMe raw-data file into a single-sample VCF file.

    argv -- command-line style list: [program, input_path, output_path].

    Exits through sys.exit with a message when the number of arguments is
    wrong or when output_path is already an existing file. Comment lines
    (starting with '#') and blank lines in the input are ignored. No-calls
    ("--") and indel calls (D/I codes) are skipped, as are positions that
    are absent from the reference table; the count of the latter is
    printed once the conversion is finished.
    """
    if len(argv) != 3:
        sys.exit("Usage is 'python conversion_script.py "
                 "path/to/23andme_input_file.txt path/to/vcf_output_file.txt'")
    # Unpack the argv that was validated above rather than sys.argv, so that
    # main() behaves correctly when called with an explicit argument list.
    input_filename, output_filename = argv[1:]

    if os.path.isfile(output_filename):
        sys.exit("Won't clobber existing file {0}".format(output_filename))

    # Load reference genome information *before* creating the output file:
    # a failure here must not leave behind an empty output file that would
    # make the next run refuse to start.
    ref_path = "23andme_reference_genome.txt.gz"
    ref_dict = _load_reference(ref_path)

    # VCF header; `date` is the module-level run date.
    header = (
        "##fileformat=VCFv4.1\n"
        "##fileDate={0}\n"
        "##source=https://github.com/acflewis/23andme_to_VCF.git\n"
        "##reference=file://{1}\n"
        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tGENOTYPE\n"
    ).format(date, ref_path)

    # Calls that cannot be expressed as a SNP genotype: no-call and indels.
    skipped_calls = frozenset(("--", "D", "I", "DI", "DD", "ID", "II"))

    pass_count = 0
    # The input is opened first so a missing input never creates an output
    # file; both handles are closed even if a line fails to parse.
    with open(input_filename, 'r') as input_data, \
            open(output_filename, 'w') as output_data:
        output_data.write(header)

        # Process 23andme data, one line at a time (streamed, not preloaded).
        for line in input_data:
            if line.startswith('#') or not line.strip():
                continue

            rsid, chrom, pos, alleles = line.split()
            if alleles in skipped_calls:
                continue

            # Ensure chromosomes are named the same as in the reference file.
            if chrom == "MT":
                chrom = "M"
            chrom = "chr{0}".format(chrom)

            # Get the reference base; positions unknown to the reference
            # are counted and skipped.
            ref = ref_dict.get("{0}.{1}".format(chrom, pos))
            if ref is None:
                pass_count += 1
                continue

            alt, genotype = get_alt_and_genotype(ref, alleles)
            output_data.write('{0}\t{1}\t{2}\t{3}\t{4}\t.'
                              '\t.\t.\tGT\t{5}\n'.format(chrom, pos, rsid,
                                                         ref.upper(),
                                                         alt.upper(),
                                                         genotype))

    print("There were {0} variants that were not"
          " matched in the reference file".format(pass_count))
# Script entry point: hand the raw command-line arguments to main().
if __name__ == "__main__":
    main(sys.argv)