1
1
#!/usr/bin/env python3
2
2
3
3
import argparse
4
- import gzip
5
4
import re
6
5
from concurrent .futures import ProcessPoolExecutor , as_completed
7
6
from Bio import SeqIO
8
7
9
8
# Command line argument processing
10
9
def process_cli ():
11
- parser = argparse .ArgumentParser (description = 'Generates a GFF file containing the locations of all GATC sites in the genome sequence' )
10
+ parser = argparse .ArgumentParser (
11
+ description = """
12
+ Generates a GFF file containing the locations of all motif sites in the genome sequence
13
+ """
14
+ )
12
15
parser .add_argument ("--fasta" , "-f" ,
13
16
type = str ,
14
- help = ' Input genome FASTA file' )
17
+ help = " Input genome FASTA file" )
15
18
parser .add_argument ("--outfile" , "-o" ,
16
19
type = str ,
17
20
required = True ,
@@ -22,7 +25,7 @@ def process_cli():
22
25
choices = ["hg38" , "mm38" , "dm6" ,
23
26
"dm3" , "hg19" , "mm39" ],
24
27
required = True ,
25
- help = ' Genome build (default: hg38)' )
28
+ help = " Genome build (default: hg38)" )
26
29
parser .add_argument ("--threads" , "-t" ,
27
30
type = int ,
28
31
default = 1 ,
@@ -43,16 +46,13 @@ def process_cli():
43
46
def regex_patterns(genome):
    """Return the scaffold and mitochondrial name patterns for a genome build.

    Args:
        genome: Genome build identifier (one of the keys of the table below).

    Returns:
        A two-element list ``[scaffold_pattern, mito_pattern]`` of regular
        expression strings. The mitochondrial pattern is matched against
        sequence names with ``re.search`` (case-insensitively) by the FASTA
        loader; the scaffold pattern is presumably used the same way
        (NOTE(review): confirm against ``load_fasta``).

    Raises:
        KeyError: If no patterns are defined for ``genome``. The message
            lists the supported builds.
    """
    # Dictionary to store scaffold and mitochondrial regex patterns
    patterns = {
        "hg38": [r"^KI|^GL", "MT"],
        "hg19": [r"^GL", "MT"],
        "mm39": [r"^JH|^GL|^MU", "MT"],
        "dm6": [r"Scaffold|\d{15}", "mitochondrion_genome"],
    }
    try:
        return patterns[genome]
    except KeyError:
        # The CLI may accept builds that have no entry here; keep the
        # exception type but say what went wrong and what is supported.
        supported = ", ".join(sorted(patterns))
        raise KeyError(
            f"No scaffold/mitochondrial patterns defined for genome build "
            f"{genome!r}; supported builds: {supported}"
        ) from None
54
55
55
-
56
56
def load_fasta (fasta , args ):
57
57
"""
58
58
Load fasta file and return dict with chr as key and sequence as value.
@@ -71,11 +71,10 @@ def load_fasta(fasta, args):
71
71
chr_seq = {k : v for k , v in chr_seq .items () if not re .search (mito_pattern , k , re .IGNORECASE )}
72
72
73
73
return chr_seq
74
-
75
-
74
+
76
75
# Generate track function
77
76
def generate_track (args ):
78
- motif = "GATC"
77
+ motif = args . motif
79
78
motif_len = len (motif )
80
79
81
80
fasta = args .fasta
@@ -101,16 +100,11 @@ def generate_track(args):
101
100
for result in results :
102
101
track .write (result )
103
102
104
- print ("All done!" )
105
-
106
-
107
103
# Process function
108
104
def process(chr_name, seq, motif, motif_len):
    """Scan one sequence for the motif and pair the hits with its name.

    Args:
        chr_name: Name of the sequence being scanned.
        seq: The sequence to scan.
        motif: Motif to look for.
        motif_len: Length of ``motif``.

    Returns:
        A tuple ``(chr_name, results)`` where ``results`` is whatever
        ``motif_hash`` returns for this sequence.
    """
    results = motif_hash(seq, chr_name, motif, motif_len)
    return chr_name, results
112
107
113
-
114
108
# Motif hash function
115
109
def motif_hash (seq , chr_name , motif , motif_len ):
116
110
results = []
@@ -125,7 +119,9 @@ def motif_hash(seq, chr_name, motif, motif_len):
125
119
# Main function
126
120
def main():
    """Parse the command line, build the motif track and report progress."""
    args = process_cli()
    print(f"Mapping {args.motif} sites in {args.fasta}")
    generate_track(args)
    print("All done!")
129
125
130
126
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
0 commit comments